From 6667375b63f73f0c37f7049dce5f2c1320b22e90 Mon Sep 17 00:00:00 2001 From: "Adam J. Kunen" Date: Tue, 5 Nov 2019 08:50:29 -0800 Subject: [PATCH 001/593] Initial working simd register type, and some unit testing --- include/RAJA/pattern/vector.hpp | 461 ++++++++++++++++++++++++++++++++ test/unit/CMakeLists.txt | 4 + test/unit/test-vector.cpp | 227 ++++++++++++++++ 3 files changed, 692 insertions(+) create mode 100644 include/RAJA/pattern/vector.hpp create mode 100644 test/unit/test-vector.cpp diff --git a/include/RAJA/pattern/vector.hpp b/include/RAJA/pattern/vector.hpp new file mode 100644 index 0000000000..c2323f985c --- /dev/null +++ b/include/RAJA/pattern/vector.hpp @@ -0,0 +1,461 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining vector operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_pattern_vector_HPP +#define RAJA_pattern_vector_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/macros.hpp" + + +// Include SIMD intrinsics header file +#include + +namespace RAJA +{ + + +/*! + * \file + * Vector operation functions in the namespace RAJA + + * + */ + + template + class SimdRegister; + + template<> + class SimdRegister{ + public: + using self_type = SimdRegister; + using element_type = double; + + static constexpr size_t s_num_elem = 4; + static constexpr size_t s_byte_width = s_num_elem*sizeof(double); + static constexpr size_t s_bit_width = s_byte_width*8; + + using simd_type = __m256d; + + private: + simd_type m_value; + + public: + + /*! 
+ * @brief Default constructor, zeros register contents + */ + SimdRegister() : m_value(_mm256_setzero_pd()) { + } + + /*! + * @brief Copy constructor from underlying simd register + */ + SimdRegister(simd_type const &c) : m_value(c) {} + + + /*! + * @brief Copy constructor + */ + SimdRegister(self_type const &c) : m_value(c.m_value) {} + + /*! + * @brief Get scalar value from vector register + * @param i Offset of scalar to get + * @return Returns scalar value at i + */ + template + constexpr + RAJA_INLINE + element_type operator[](IDX i) const + {return m_value[i];} + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + template + RAJA_INLINE + void set(IDX i, element_type value) + {m_value[i] = value;} + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + m_value = _mm256_set1_pd(value); + return *this; + } + + /*! + * @brief Assign one register to antoher + * @param x Vector to copy + * @return Value of (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + m_value = x.m_value; + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + return self_type(_mm256_add_pd(m_value, x.m_value)); + } + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + m_value = _mm256_add_pd(m_value, x.m_value); + return *this; + } + + /*! 
+ * @brief Subtract two vector registers + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + return self_type(_mm256_sub_pd(m_value, x.m_value)); + } + + /*! + * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + m_value = _mm256_sub_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + return self_type(_mm256_mul_pd(m_value, x.m_value)); + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiple with this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + m_value = _mm256_mul_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + return self_type(_mm256_div_pd(m_value, x.m_value)); + } + + /*! + * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + m_value = _mm256_div_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Sum the elements of this vector + * @return Sum of the values of the vectors scalar elements + */ + RAJA_INLINE + element_type sum() const + { + auto hsum = _mm256_hadd_pd(m_value, m_value); + return hsum[0] + hsum[2]; + } + + /*! 
+ * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + return self_type(_mm256_mul_pd(m_value, x.m_value)).sum(); + } + }; + + + /** + * A specialization for a single element SIMD register. + * We will implement this as a scalar value, and let the compiler use + * whatever registers it deems appropriate. + */ + template + class SimdRegister{ + public: + using self_type = SimdRegister; + using element_type = T; + + static constexpr size_t s_num_elem = 1; + static constexpr size_t s_byte_width = sizeof(T); + static constexpr size_t s_bit_width = s_byte_width*8; + + private: + T m_value; + + public: + + /*! + * @brief Default constructor, zeros register contents + */ + SimdRegister() : m_value(0) { + } + + /*! + * @brief Copy constructor from underlying simd register + */ + SimdRegister(T const &c) : m_value(c) {} + + + /*! + * @brief Copy constructor + */ + SimdRegister(self_type const &c) : m_value(c.m_value) {} + + /*! + * @brief Get scalar value from vector register + * @param i Offset of scalar to get + * @return Returns scalar value at i + */ + template + constexpr + RAJA_INLINE + element_type operator[](IDX) const + {return m_value;} + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + template + RAJA_INLINE + void set(IDX , element_type value) + {m_value = value;} + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + m_value = value; + return *this; + } + + /*! + * @brief Assign one register to antoher + * @param x Vector to copy + * @return Value of (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + m_value = x.m_value; + return *this; + } + + + /*! 
+ * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + return self_type(m_value + x.m_value); + } + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + m_value = m_value + x.m_value; + return *this; + } + + /*! + * @brief Subtract two vector registers + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + return self_type(m_value - x.m_value); + } + + /*! + * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + m_value = m_value - x.m_value; + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + return self_type(m_value * x.m_value); + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiple with this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + m_value = m_value * x.m_value; + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + return self_type(m_value / x.m_value); + } + + /*! 
+ * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + m_value = m_value / x.m_value; + return *this; + } + + /*! + * @brief Sum the elements of this vector + * @return Sum of the values of the vectors scalar elements + */ + RAJA_INLINE + element_type sum() const + { + return m_value; + } + + /*! + * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + return m_value*x.m_value; + } + }; + + + +// template +// struct FixedVector { +// public: +// using self_type = FixedVector; +// using element_type = typename REGISTER::element_type; +// +// static constexpr size_t s_bit_width = 256; +// static constexpr size_t s_byte_width = s_bit_width/8; +// static constexpr size_t s_num_elem = s_byte_width / sizeof(double); +// +// using register_type = REGISTER; +// +// private: +// register_type m_values[s_num_registers]; +// +// public: +// }; + + + + +} // namespace RAJA + +#endif diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 9fffa1be08..9878c514ab 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -72,6 +72,10 @@ raja_add_test( raja_add_test( NAME test-kernel-lambda-args SOURCES test-kernel-lambda-args.cpp) + +raja_add_test( + NAME test-vector + SOURCES test-vector.cpp) add_subdirectory(cpu) diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp new file mode 100644 index 0000000000..6c41110cf0 --- /dev/null +++ b/test/unit/test-vector.cpp @@ -0,0 +1,227 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for basic vector operations +/// + +#include "RAJA/RAJA.hpp" +#include "gtest/gtest.h" + +#include "RAJA/pattern/vector.hpp" + + + +using TestTypes = ::testing::Types, + RAJA::SimdRegister>; + + + +template +class VectorTest : public ::testing::Test +{ +protected: + + VectorTest() = default; + virtual ~VectorTest() = default; + + virtual void SetUp() + { + } + + virtual void TearDown() + { + } +}; +TYPED_TEST_CASE_P(VectorTest); + + +/* + * We are using drand48() for input values so the compiler cannot do fancy + * things, like constexpr out all of the intrinsics. + */ + +TYPED_TEST_P(VectorTest, SimdRegisterSetGet) +{ + + using register_t = TypeParam; + + static constexpr size_t num_elem = register_t::s_num_elem; + + double A[num_elem]; + register_t x; + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = drand48(); + x.set(i, A[i]); + } + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_EQ(x[i], A[i]); + } + +} + + + + +TYPED_TEST_P(VectorTest, SimdRegisterAdd) +{ + + using register_t = TypeParam; + + static constexpr size_t num_elem = register_t::s_num_elem; + + double A[num_elem], B[num_elem]; + register_t x, y; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = drand48(); + B[i] = drand48(); + x.set(i, A[i]); + y.set(i, B[i]); + } + + register_t z = x+y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z[i], A[i] + B[i]); + } + + register_t z2 = x; + z2 += y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z2[i], A[i] + B[i]); + } + +} + +TYPED_TEST_P(VectorTest, SimdRegisterSubtract) +{ + + using register_t = TypeParam; + + static constexpr size_t num_elem = register_t::s_num_elem; + + double A[num_elem], B[num_elem]; + register_t x, y; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = drand48(); + B[i] = drand48(); + x.set(i, A[i]); + y.set(i, B[i]); + } + + register_t z 
= x-y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z[i], A[i] - B[i]); + } + + register_t z2 = x; + z2 -= y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z2[i], A[i] - B[i]); + } +} + +TYPED_TEST_P(VectorTest, SimdRegisterMultiply) +{ + + using register_t = TypeParam; + + static constexpr size_t num_elem = register_t::s_num_elem; + + double A[num_elem], B[num_elem]; + register_t x, y; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = drand48(); + B[i] = drand48(); + x.set(i, A[i]); + y.set(i, B[i]); + } + + register_t z = x*y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z[i], A[i] * B[i]); + } + + register_t z2 = x; + z2 *= y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z2[i], A[i] * B[i]); + } +} + +TYPED_TEST_P(VectorTest, SimdRegisterDivide) +{ + + using register_t = TypeParam; + + static constexpr size_t num_elem = register_t::s_num_elem; + + double A[num_elem], B[num_elem]; + register_t x, y; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = drand48(); + B[i] = drand48()+1.0; + x.set(i, A[i]); + y.set(i, B[i]); + } + + register_t z = x/y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z[i], A[i] / B[i]); + } + + register_t z2 = x; + z2 /= y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z2[i], A[i] / B[i]); + } +} + +TYPED_TEST_P(VectorTest, SimdRegisterDotProduct) +{ + + using register_t = TypeParam; + + static constexpr size_t num_elem = register_t::s_num_elem; + + double A[num_elem], B[num_elem]; + register_t x, y; + + double expected = 0.0; + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = drand48(); + B[i] = drand48(); + x.set(i, A[i]); + y.set(i, B[i]); + expected += A[i]*B[i]; + } + + ASSERT_DOUBLE_EQ(x.dot(y), expected); + +} + + +REGISTER_TYPED_TEST_CASE_P(VectorTest, SimdRegisterSetGet, + SimdRegisterAdd, + SimdRegisterSubtract, + SimdRegisterMultiply, + SimdRegisterDivide, + SimdRegisterDotProduct); + +INSTANTIATE_TYPED_TEST_CASE_P(SIMD, VectorTest, 
TestTypes); From a9b2ed53920c8744d58cc8c6462b6bb25c73146a Mon Sep 17 00:00:00 2001 From: "Adam J. Kunen" Date: Tue, 5 Nov 2019 15:34:29 -0800 Subject: [PATCH 002/593] Implemented working 1,2,3,4-wide double vectors, and typed unit testing --- include/RAJA/pattern/vector.hpp | 276 ++++----------- include/RAJA/policy/simd/register/double2.hpp | 309 +++++++++++++++++ include/RAJA/policy/simd/register/double3.hpp | 328 ++++++++++++++++++ include/RAJA/policy/simd/register/double4.hpp | 323 +++++++++++++++++ test/unit/test-vector.cpp | 158 +++++++-- 5 files changed, 1147 insertions(+), 247 deletions(-) create mode 100644 include/RAJA/policy/simd/register/double2.hpp create mode 100644 include/RAJA/policy/simd/register/double3.hpp create mode 100644 include/RAJA/policy/simd/register/double4.hpp diff --git a/include/RAJA/pattern/vector.hpp b/include/RAJA/pattern/vector.hpp index c2323f985c..e685929df1 100644 --- a/include/RAJA/pattern/vector.hpp +++ b/include/RAJA/pattern/vector.hpp @@ -22,10 +22,6 @@ #include "RAJA/util/macros.hpp" - -// Include SIMD intrinsics header file -#include - namespace RAJA { @@ -40,33 +36,37 @@ namespace RAJA template class SimdRegister; - template<> - class SimdRegister{ + + /** + * A specialization for a single element SIMD register. + * We will implement this as a scalar value, and let the compiler use + * whatever registers it deems appropriate. + */ + template + class SimdRegister{ public: - using self_type = SimdRegister; - using element_type = double; + using self_type = SimdRegister; + using element_type = T; - static constexpr size_t s_num_elem = 4; - static constexpr size_t s_byte_width = s_num_elem*sizeof(double); + static constexpr size_t s_num_elem = 1; + static constexpr size_t s_byte_width = sizeof(T); static constexpr size_t s_bit_width = s_byte_width*8; - using simd_type = __m256d; - private: - simd_type m_value; + T m_value; public: /*! 
* @brief Default constructor, zeros register contents */ - SimdRegister() : m_value(_mm256_setzero_pd()) { + SimdRegister() : m_value(0) { } /*! * @brief Copy constructor from underlying simd register */ - SimdRegister(simd_type const &c) : m_value(c) {} + SimdRegister(T const &c) : m_value(c) {} /*! @@ -74,205 +74,49 @@ namespace RAJA */ SimdRegister(self_type const &c) : m_value(c.m_value) {} - /*! - * @brief Get scalar value from vector register - * @param i Offset of scalar to get - * @return Returns scalar value at i - */ - template - constexpr - RAJA_INLINE - element_type operator[](IDX i) const - {return m_value[i];} - - - /*! - * @brief Set scalar value in vector register - * @param i Offset of scalar to set - * @param value Value of scalar to set - */ - template - RAJA_INLINE - void set(IDX i, element_type value) - {m_value[i] = value;} - - /*! - * @brief Set entire vector to a single scalar value - * @param value Value to set all vector elements to - */ - RAJA_INLINE - self_type const &operator=(element_type value) - { - m_value = _mm256_set1_pd(value); - return *this; - } - - /*! - * @brief Assign one register to antoher - * @param x Vector to copy - * @return Value of (*this) - */ - RAJA_INLINE - self_type const &operator=(self_type const &x) - { - m_value = x.m_value; - return *this; - } - - - /*! - * @brief Add two vector registers - * @param x Vector to add to this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator+(self_type const &x) const - { - return self_type(_mm256_add_pd(m_value, x.m_value)); - } - - /*! - * @brief Add a vector to this vector - * @param x Vector to add to this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator+=(self_type const &x) - { - m_value = _mm256_add_pd(m_value, x.m_value); - return *this; - } - - /*! 
- * @brief Subtract two vector registers - * @param x Vector to subctract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator-(self_type const &x) const - { - return self_type(_mm256_sub_pd(m_value, x.m_value)); - } - - /*! - * @brief Subtract a vector from this vector - * @param x Vector to subtract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator-=(self_type const &x) - { - m_value = _mm256_sub_pd(m_value, x.m_value); - return *this; - } - - /*! - * @brief Multiply two vector registers, element wise - * @param x Vector to subctract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator*(self_type const &x) const - { - return self_type(_mm256_mul_pd(m_value, x.m_value)); - } /*! - * @brief Multiply a vector with this vector - * @param x Vector to multiple with this register - * @return Value of (*this)+x + * @brief Load constructor, assuming scalars are in consecutive memory + * locations. */ - RAJA_INLINE - self_type const &operator*=(self_type const &x) - { - m_value = _mm256_mul_pd(m_value, x.m_value); - return *this; + void load(element_type const *ptr){ + m_value = ptr[0]; } /*! - * @brief Divide two vector registers, element wise - * @param x Vector to subctract from this register - * @return Value of (*this)+x + * @brief Strided load constructor, when scalars are located in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) */ - RAJA_INLINE - self_type operator/(self_type const &x) const - { - return self_type(_mm256_div_pd(m_value, x.m_value)); + void load(element_type const *ptr, size_t ){ + m_value = ptr[0]; } - /*! 
- * @brief Divide this vector by another vector - * @param x Vector to divide by - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator/=(self_type const &x) - { - m_value = _mm256_div_pd(m_value, x.m_value); - return *this; - } /*! - * @brief Sum the elements of this vector - * @return Sum of the values of the vectors scalar elements + * @brief Store operation, assuming scalars are in consecutive memory + * locations. */ - RAJA_INLINE - element_type sum() const - { - auto hsum = _mm256_hadd_pd(m_value, m_value); - return hsum[0] + hsum[2]; + void store(element_type *ptr) const{ + ptr[0] = m_value; } /*! - * @brief Dot product of two vectors - * @param x Other vector to dot with this vector - * @return Value of (*this) dot x + * @brief Strided store operation, where scalars are stored in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "scatter" instructions if they are + * available. */ - RAJA_INLINE - element_type dot(self_type const &x) const - { - return self_type(_mm256_mul_pd(m_value, x.m_value)).sum(); + void store(element_type *ptr, size_t) const{ + ptr[0] = m_value; } - }; - - - /** - * A specialization for a single element SIMD register. - * We will implement this as a scalar value, and let the compiler use - * whatever registers it deems appropriate. - */ - template - class SimdRegister{ - public: - using self_type = SimdRegister; - using element_type = T; - - static constexpr size_t s_num_elem = 1; - static constexpr size_t s_byte_width = sizeof(T); - static constexpr size_t s_bit_width = s_byte_width*8; - - private: - T m_value; - - public: - - /*! - * @brief Default constructor, zeros register contents - */ - SimdRegister() : m_value(0) { - } - - /*! - * @brief Copy constructor from underlying simd register - */ - SimdRegister(T const &c) : m_value(c) {} - /*! - * @brief Copy constructor - */ - SimdRegister(self_type const &c) : m_value(c.m_value) {} - /*! 
* @brief Get scalar value from vector register * @param i Offset of scalar to get @@ -431,31 +275,35 @@ namespace RAJA { return m_value*x.m_value; } - }; + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + return m_value; + } -// template -// struct FixedVector { -// public: -// using self_type = FixedVector; -// using element_type = typename REGISTER::element_type; -// -// static constexpr size_t s_bit_width = 256; -// static constexpr size_t s_byte_width = s_bit_width/8; -// static constexpr size_t s_num_elem = s_byte_width / sizeof(double); -// -// using register_type = REGISTER; -// -// private: -// register_type m_values[s_num_registers]; -// -// public: -// }; + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + return m_value; + } + }; +} // namespace RAJA +#include +#include +#include -} // namespace RAJA #endif diff --git a/include/RAJA/policy/simd/register/double2.hpp b/include/RAJA/policy/simd/register/double2.hpp new file mode 100644 index 0000000000..a5fdb5de0b --- /dev/null +++ b/include/RAJA/policy/simd/register/double2.hpp @@ -0,0 +1,309 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining vector operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_simd_register_double2_HPP +#define RAJA_policy_simd_register_double2_HPP + +#include "RAJA/config.hpp" +#include "RAJA/util/macros.hpp" + +// Include SIMD intrinsics header file +#include +#include + + +namespace RAJA +{ + + + template<> + class SimdRegister{ + public: + using self_type = SimdRegister; + using element_type = double; + + static constexpr size_t s_num_elem = 2; + static constexpr size_t s_byte_width = s_num_elem*sizeof(double); + static constexpr size_t s_bit_width = s_byte_width*8; + + using simd_type = __m128d; + + private: + simd_type m_value; + + public: + + /*! + * @brief Default constructor, zeros register contents + */ + SimdRegister() : m_value(_mm_setzero_pd()) { + } + + /*! + * @brief Copy constructor from underlying simd register + */ + explicit SimdRegister(simd_type const &c) : m_value(c) {} + + + /*! + * @brief Copy constructor + */ + SimdRegister(self_type const &c) : m_value(c.m_value) {} + + /*! + * @brief Load operation, assuming scalars are in consecutive memory + * locations. + */ + void load(element_type const *ptr){ + m_value = _mm_loadu_pd(ptr); + } + + /*! + * @brief Strided load operation, when scalars are located in memory + * locations ptr, ptr+stride + * + * + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) + */ + void load(element_type const *ptr, size_t stride){ + m_value = _mm_set_pd(ptr[stride], ptr[0]); + } + + + /*! + * @brief Store operation, assuming scalars are in consecutive memory + * locations. + */ + void store(element_type *ptr) const{ + _mm_storeu_pd(ptr, m_value); + } + + /*! + * @brief Strided store operation, where scalars are stored in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "scatter" instructions if they are + * available. 
+ */ + void store(element_type *ptr, size_t stride) const{ + ptr[0] = m_value[0]; + ptr[stride] = m_value[1]; + } + + /*! + * @brief Get scalar value from vector register + * @param i Offset of scalar to get + * @return Returns scalar value at i + */ + template + constexpr + RAJA_INLINE + element_type operator[](IDX i) const + {return m_value[i];} + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + template + RAJA_INLINE + void set(IDX i, element_type value) + {m_value[i] = value;} + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + m_value = _mm_set1_pd(value); + return *this; + } + + /*! + * @brief Assign one register to antoher + * @param x Vector to copy + * @return Value of (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + m_value = x.m_value; + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + return self_type(_mm_add_pd(m_value, x.m_value)); + } + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + m_value = _mm_add_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Subtract two vector registers + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + return self_type(_mm_sub_pd(m_value, x.m_value)); + } + + /*! 
+ * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + m_value = _mm_sub_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + return self_type(_mm_mul_pd(m_value, x.m_value)); + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiple with this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + m_value = _mm_mul_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + return self_type(_mm_div_pd(m_value, x.m_value)); + } + + /*! + * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + m_value = _mm_div_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Sum the elements of this vector + * @return Sum of the values of the vectors scalar elements + */ + RAJA_INLINE + element_type sum() const + { + auto hsum = _mm_hadd_pd(m_value, m_value); + return hsum[0]; + } + + /*! + * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + return self_type(_mm_mul_pd(m_value, x.m_value)).sum(); + } + + /*! 
+ * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + // swap the two lanes + simd_type a = _mm_permute_pd(m_value, 0x01); + + // take the max of each lane (should be same result in each lane) + simd_type b = _mm_max_pd(m_value, a); + + // return the lower lane + return b[0]; + } + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + // swap the two lanes + simd_type a = _mm_permute_pd(m_value, 0x01); + + // take the max of each lane (should be same result in each lane) + simd_type b = _mm_min_pd(m_value, a); + + // return the lower lane + return b[0]; + } + }; + + + +} // namespace RAJA + + +#endif diff --git a/include/RAJA/policy/simd/register/double3.hpp b/include/RAJA/policy/simd/register/double3.hpp new file mode 100644 index 0000000000..2fdea6ed7c --- /dev/null +++ b/include/RAJA/policy/simd/register/double3.hpp @@ -0,0 +1,328 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining vector operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_simd_register_double3_HPP +#define RAJA_policy_simd_register_double3_HPP + +#include "RAJA/config.hpp" +#include "RAJA/util/macros.hpp" + +// Include SIMD intrinsics header file +#include +#include + + +namespace RAJA +{ + + + template<> + class SimdRegister{ + public: + using self_type = SimdRegister; + using element_type = double; + + static constexpr size_t s_num_elem = 3; + static constexpr size_t s_byte_width = s_num_elem*sizeof(double); + static constexpr size_t s_bit_width = s_byte_width*8; + + // Using a 256-bit (4 double) vector, but padding out the upper most + // value + using simd_type = __m256d; + + + private: + simd_type m_value; + + // Mask used to mask off the upper double from the vector + using mask_type = __m256i; + static constexpr mask_type s_mask = (__m256i)(__v4di){ -1, -1, -1, 0}; + + public: + + /*! + * @brief Default constructor, zeros register contents + */ + SimdRegister() : m_value(_mm256_setzero_pd()) { + } + + /*! + * @brief Copy constructor from underlying simd register + */ + explicit SimdRegister(simd_type const &c) : m_value(c) {} + + + /*! + * @brief Copy constructor + */ + SimdRegister(self_type const &c) : m_value(c.m_value) {} + + /*! + * @brief Load constructor, assuming scalars are in consecutive memory + * locations. + */ + void load(element_type const *ptr){ + m_value = _mm256_maskload_pd(ptr, s_mask); + } + + /*! + * @brief Strided load constructor, when scalars are located in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) + */ + void load(element_type const *ptr, size_t stride){ + m_value =_mm256_set_pd(0.0, + ptr[2*stride], + ptr[stride], + ptr[0]); + } + + + /*! 
+ * @brief Store operation, assuming scalars are in consecutive memory + * locations. + */ + void store(element_type *ptr) const{ + _mm256_maskstore_pd(ptr, m_value, s_mask); + } + + /*! + * @brief Strided store operation, where scalars are stored in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * Exactly s_num_elem scalars are written. + * Note: this could be done with "scatter" instructions if they are + * available. + */ + void store(element_type *ptr, size_t stride) const{ + for(size_t i = 0;i < s_num_elem;++ i){ + ptr[i*stride] = m_value[i]; + } + } + + /*! + * @brief Get scalar value from vector register + * @param i Offset of scalar to get + * @return Scalar value at offset i (no bounds check is performed) + */ + template + constexpr + RAJA_INLINE + element_type operator[](IDX i) const + {return m_value[i];} + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + template + RAJA_INLINE + void set(IDX i, element_type value) + {m_value[i] = value;} + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + m_value = _mm256_set1_pd(value); + return *this; + } + + /*! + * @brief Assign one register to another + * @param x Vector to copy + * @return Reference to (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + m_value = x.m_value; + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + return self_type(_mm256_add_pd(m_value, x.m_value)); + } + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Reference to (*this), which now holds (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + m_value = _mm256_add_pd(m_value, x.m_value); + return *this; + } + + /*! 
+ * @brief Subtract two vector registers + * @param x Vector to subtract from this register + * @return Value of (*this)-x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + return self_type(_mm256_sub_pd(m_value, x.m_value)); + } + + /*! + * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Reference to (*this), which now holds (*this)-x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + m_value = _mm256_sub_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to multiply with this register + * @return Value of (*this)*x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + return self_type(_mm256_mul_pd(m_value, x.m_value)); + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiply with this register + * @return Reference to (*this), which now holds (*this)*x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + m_value = _mm256_mul_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to divide this register by + * @return Value of (*this)/x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + return self_type(_mm256_div_pd(m_value, x.m_value)); + } + + /*! + * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Reference to (*this), which now holds (*this)/x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + m_value = _mm256_div_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Sum the elements of this vector + * @return Sum of elements 0, 1 and 2 (the padding element is not added) + */ + RAJA_INLINE + element_type sum() const + { + auto hsum = _mm256_hadd_pd(m_value, m_value); + return hsum[0] + m_value[2]; + } + + /*! 
+ * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + return self_type(_mm256_mul_pd(m_value, x.m_value)).sum(); + } + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + // swap elements 0 and 1 of the register (imm8 = 0x01); element 2 stays put + simd_type a = _mm256_shuffle_pd(m_value, m_value, 0x01); + + // take the element-wise maximum of m_value and a + // this gives us b[0] = max(m_value[0], m_value[1]) + // and b[2] = m_value[2]; the padding element (index 3) + // only influences b[3], which is never read + simd_type b = _mm256_max_pd(m_value, a); + + // now take the maximum of b[0] and b[2] + return std::max(b[0], b[2]); + } + + /*! + * @brief Returns the smallest element + * @return The smallest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + // swap elements 0 and 1 of the register (imm8 = 0x01) + // m_value = ABCD (D is the padding element) + // a = BACC + simd_type a = _mm256_shuffle_pd(m_value, m_value, 0x01); + + // take the element-wise minimum of m_value and a + // this gives us b=XXCY where + // X = min(A, B) + // Y = min(C, D), which is never read + simd_type b = _mm256_min_pd(m_value, a); + + // now take the minimum of b[0] and b[2] + return std::min(b[0], b[2]); + } + }; + + + +} // namespace RAJA + + +#endif diff --git a/include/RAJA/policy/simd/register/double4.hpp b/include/RAJA/policy/simd/register/double4.hpp new file mode 100644 index 0000000000..94803082f3 --- /dev/null +++ b/include/RAJA/policy/simd/register/double4.hpp @@ -0,0 +1,323 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining vector operations. 
+ * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_simd_register_double4_HPP +#define RAJA_policy_simd_register_double4_HPP + +#include "RAJA/config.hpp" +#include "RAJA/util/macros.hpp" + +// Include SIMD intrinsics header file +#include +#include + + +namespace RAJA +{ + + + template<> + class SimdRegister{ + public: + using self_type = SimdRegister; + using element_type = double; + + static constexpr size_t s_num_elem = 4; + static constexpr size_t s_byte_width = s_num_elem*sizeof(double); + static constexpr size_t s_bit_width = s_byte_width*8; + + using simd_type = __m256d; + + private: + simd_type m_value; + + public: + + /*! + * @brief Default constructor, zeros register contents + */ + SimdRegister() : m_value(_mm256_setzero_pd()) { + } + + /*! + * @brief Construct from a value of the underlying simd type (explicit) + */ + explicit SimdRegister(simd_type const &c) : m_value(c) {} + + + /*! + * @brief Copy constructor + */ + SimdRegister(self_type const &c) : m_value(c.m_value) {} + + + /*! + * @brief Load operation, assuming scalars are in consecutive memory + * locations; uses the unaligned load _mm256_loadu_pd. + */ + void load(element_type const *ptr){ + m_value = _mm256_loadu_pd(ptr); + } + + /*! + * @brief Strided load operation, when scalars are located in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * Element i of the register is read from ptr[i*stride]. + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) + */ + void load(element_type const *ptr, size_t stride){ + m_value =_mm256_set_pd(ptr[3*stride], + ptr[2*stride], + ptr[stride], + ptr[0]); + } + + + + /*! 
+ * @brief Store operation, assuming scalars are in consecutive memory + * locations. + */ + void store(element_type *ptr) const{ + _mm256_storeu_pd(ptr, m_value); + } + + /*! + * @brief Strided store operation, where scalars are stored in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * Exactly s_num_elem scalars are written. + * Note: this could be done with "scatter" instructions if they are + * available. + */ + void store(element_type *ptr, size_t stride) const{ + for(size_t i = 0;i < s_num_elem;++ i){ + ptr[i*stride] = m_value[i]; + } + } + + /*! + * @brief Get scalar value from vector register + * @param i Offset of scalar to get + * @return Scalar value at offset i (no bounds check is performed) + */ + template + constexpr + RAJA_INLINE + element_type operator[](IDX i) const + {return m_value[i];} + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + template + RAJA_INLINE + void set(IDX i, element_type value) + {m_value[i] = value;} + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + m_value = _mm256_set1_pd(value); + return *this; + } + + /*! + * @brief Assign one register to another + * @param x Vector to copy + * @return Reference to (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + m_value = x.m_value; + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + return self_type(_mm256_add_pd(m_value, x.m_value)); + } + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Reference to (*this), which now holds (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + m_value = _mm256_add_pd(m_value, x.m_value); + return *this; + } + + /*! 
+ * @brief Subtract two vector registers + * @param x Vector to subtract from this register + * @return Value of (*this)-x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + return self_type(_mm256_sub_pd(m_value, x.m_value)); + } + + /*! + * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Reference to (*this), which now holds (*this)-x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + m_value = _mm256_sub_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to multiply with this register + * @return Value of (*this)*x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + return self_type(_mm256_mul_pd(m_value, x.m_value)); + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiply with this register + * @return Reference to (*this), which now holds (*this)*x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + m_value = _mm256_mul_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to divide this register by + * @return Value of (*this)/x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + return self_type(_mm256_div_pd(m_value, x.m_value)); + } + + /*! + * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Reference to (*this), which now holds (*this)/x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + m_value = _mm256_div_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Sum the elements of this vector + * @return Sum of the values of the vector's scalar elements + */ + RAJA_INLINE + element_type sum() const + { + auto hsum = _mm256_hadd_pd(m_value, m_value); + return hsum[0] + hsum[2]; + } + + /*! 
+ * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + return self_type(_mm256_mul_pd(m_value, x.m_value)).sum(); + } + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + // swap the two elements within each 128-bit half of the register (imm8 = 0x05) + simd_type a = _mm256_shuffle_pd(m_value, m_value, 0x05); + + // take the element-wise maximum of m_value and a + // this gives us b=XXYY where + // X = max(m_value[0], m_value[1]) + // Y = max(m_value[2], m_value[3]) + simd_type b = _mm256_max_pd(m_value, a); + + // now take the maximum of a lower and upper lane + return std::max(b[0], b[2]); + } + + /*! + * @brief Returns the smallest element + * @return The smallest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + // swap the two elements within each 128-bit half of the register (imm8 = 0x05) + // m_value = ABCD + // a = BADC + simd_type a = _mm256_shuffle_pd(m_value, m_value, 0x05); + + // take the element-wise minimum of m_value and a + // this gives us b=XXYY where + // X = min(A, B) + // Y = min(C, D) + simd_type b = _mm256_min_pd(m_value, a); + + // now take the minimum of a lower and upper lane + return std::min(b[0], b[2]); + } + }; + + + +} // namespace RAJA + + +#endif diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index 6c41110cf0..96c68fa220 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -16,18 +16,22 @@ -using TestTypes = ::testing::Types, +using TestTypes = ::testing::Types, + RAJA::SimdRegister, + RAJA::SimdRegister, + RAJA::SimdRegister, + RAJA::SimdRegister, RAJA::SimdRegister>; template -class VectorTest : public ::testing::Test +class RegisterTest : public ::testing::Test { protected: - VectorTest() = default; - virtual ~VectorTest() = default; + RegisterTest() = default; + virtual ~RegisterTest() = default; 
virtual void SetUp() { @@ -37,7 +41,7 @@ class VectorTest : public ::testing::Test { } }; -TYPED_TEST_CASE_P(VectorTest); +TYPED_TEST_CASE_P(RegisterTest); /* @@ -45,42 +49,75 @@ TYPED_TEST_CASE_P(VectorTest); * things, like constexpr out all of the intrinsics. */ -TYPED_TEST_P(VectorTest, SimdRegisterSetGet) +TYPED_TEST_P(RegisterTest, SimdRegisterSetGet) { using register_t = TypeParam; + using element_t = typename register_t::element_type; static constexpr size_t num_elem = register_t::s_num_elem; - double A[num_elem]; + element_t A[num_elem]; register_t x; for(size_t i = 0;i < num_elem; ++ i){ - A[i] = drand48(); + A[i] = (element_t)(drand48()*1000.0); x.set(i, A[i]); } for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_EQ(x[i], A[i]); + ASSERT_DOUBLE_EQ(x[i], A[i]); } } +TYPED_TEST_P(RegisterTest, SimdRegisterLoad) +{ + + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem*2]; + for(size_t i = 0;i < num_elem*2; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + } + + + // load stride-1 from pointer + register_t x; + x.load(A); + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(x[i], A[i]); + } + // load stride-2from pointer + register_t y; + y.load(A, 2); -TYPED_TEST_P(VectorTest, SimdRegisterAdd) + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(y[i], A[i*2]); + } +} + + + +TYPED_TEST_P(RegisterTest, SimdRegisterAdd) { using register_t = TypeParam; + using element_t = typename register_t::element_type; static constexpr size_t num_elem = register_t::s_num_elem; - double A[num_elem], B[num_elem]; + element_t A[num_elem], B[num_elem]; register_t x, y; for(size_t i = 0;i < num_elem; ++ i){ - A[i] = drand48(); - B[i] = drand48(); + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); x.set(i, A[i]); y.set(i, B[i]); } @@ -100,19 +137,20 @@ TYPED_TEST_P(VectorTest, SimdRegisterAdd) } -TYPED_TEST_P(VectorTest, 
SimdRegisterSubtract) +TYPED_TEST_P(RegisterTest, SimdRegisterSubtract) { using register_t = TypeParam; + using element_t = typename register_t::element_type; static constexpr size_t num_elem = register_t::s_num_elem; - double A[num_elem], B[num_elem]; + element_t A[num_elem], B[num_elem]; register_t x, y; for(size_t i = 0;i < num_elem; ++ i){ - A[i] = drand48(); - B[i] = drand48(); + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); x.set(i, A[i]); y.set(i, B[i]); } @@ -131,19 +169,20 @@ TYPED_TEST_P(VectorTest, SimdRegisterSubtract) } } -TYPED_TEST_P(VectorTest, SimdRegisterMultiply) +TYPED_TEST_P(RegisterTest, SimdRegisterMultiply) { using register_t = TypeParam; + using element_t = typename register_t::element_type; static constexpr size_t num_elem = register_t::s_num_elem; - double A[num_elem], B[num_elem]; + element_t A[num_elem], B[num_elem]; register_t x, y; for(size_t i = 0;i < num_elem; ++ i){ - A[i] = drand48(); - B[i] = drand48(); + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); x.set(i, A[i]); y.set(i, B[i]); } @@ -162,19 +201,20 @@ TYPED_TEST_P(VectorTest, SimdRegisterMultiply) } } -TYPED_TEST_P(VectorTest, SimdRegisterDivide) +TYPED_TEST_P(RegisterTest, SimdRegisterDivide) { using register_t = TypeParam; + using element_t = typename register_t::element_type; static constexpr size_t num_elem = register_t::s_num_elem; - double A[num_elem], B[num_elem]; + element_t A[num_elem], B[num_elem]; register_t x, y; for(size_t i = 0;i < num_elem; ++ i){ - A[i] = drand48(); - B[i] = drand48()+1.0; + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0)+1.0; x.set(i, A[i]); y.set(i, B[i]); } @@ -193,20 +233,21 @@ TYPED_TEST_P(VectorTest, SimdRegisterDivide) } } -TYPED_TEST_P(VectorTest, SimdRegisterDotProduct) +TYPED_TEST_P(RegisterTest, SimdRegisterDotProduct) { using register_t = TypeParam; + using element_t = typename register_t::element_type; static constexpr size_t num_elem = 
register_t::s_num_elem; - double A[num_elem], B[num_elem]; + element_t A[num_elem], B[num_elem]; register_t x, y; - double expected = 0.0; + element_t expected = 0.0; for(size_t i = 0;i < num_elem; ++ i){ - A[i] = drand48(); - B[i] = drand48(); + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); x.set(i, A[i]); y.set(i, B[i]); expected += A[i]*B[i]; @@ -216,12 +257,63 @@ TYPED_TEST_P(VectorTest, SimdRegisterDotProduct) } +TYPED_TEST_P(RegisterTest, SimdRegisterMax) +{ + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem]; + register_t x; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + x.set(i, A[i]); + } + + element_t expected = A[0]; + for(size_t i = 1;i < num_elem;++ i){ + expected = expected > A[i] ? expected : A[i]; + } + + ASSERT_DOUBLE_EQ(x.max(), expected); + +} + +TYPED_TEST_P(RegisterTest, SimdRegisterMin) +{ + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem]; + register_t x; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + x.set(i, A[i]); + } + + element_t expected = A[0]; + for(size_t i = 1;i < num_elem;++ i){ + expected = expected < A[i] ? 
expected : A[i]; + } + + ASSERT_DOUBLE_EQ(x.min(), expected); + +} + -REGISTER_TYPED_TEST_CASE_P(VectorTest, SimdRegisterSetGet, +REGISTER_TYPED_TEST_CASE_P(RegisterTest, SimdRegisterSetGet, + SimdRegisterLoad, SimdRegisterAdd, SimdRegisterSubtract, SimdRegisterMultiply, SimdRegisterDivide, - SimdRegisterDotProduct); + SimdRegisterDotProduct, + SimdRegisterMax, + SimdRegisterMin); -INSTANTIATE_TYPED_TEST_CASE_P(SIMD, VectorTest, TestTypes); +INSTANTIATE_TYPED_TEST_CASE_P(SIMD, RegisterTest, TestTypes); From 42113c0064af2f7518f6d0f431e1c5918d20835c Mon Sep 17 00:00:00 2001 From: "Adam J. Kunen" Date: Wed, 6 Nov 2019 09:49:48 -0800 Subject: [PATCH 003/593] Some cleanup, changed to use a register policy --- include/RAJA/pattern/register.hpp | 47 +++ include/RAJA/pattern/vector.hpp | 283 ++++++++++++---- include/RAJA/policy/sequential/register.hpp | 23 ++ .../policy/sequential/register/scalar.hpp | 312 ++++++++++++++++++ include/RAJA/policy/simd/policy.hpp | 3 + include/RAJA/policy/simd/register.hpp | 28 ++ .../register/{double2.hpp => avx_double2.hpp} | 34 +- .../register/{double3.hpp => avx_double3.hpp} | 51 ++- .../register/{double4.hpp => avx_double4.hpp} | 37 ++- test/unit/test-vector.cpp | 20 +- 10 files changed, 743 insertions(+), 95 deletions(-) create mode 100644 include/RAJA/pattern/register.hpp create mode 100644 include/RAJA/policy/sequential/register.hpp create mode 100644 include/RAJA/policy/sequential/register/scalar.hpp create mode 100644 include/RAJA/policy/simd/register.hpp rename include/RAJA/policy/simd/register/{double2.hpp => avx_double2.hpp} (89%) rename include/RAJA/policy/simd/register/{double3.hpp => avx_double3.hpp} (87%) rename include/RAJA/policy/simd/register/{double4.hpp => avx_double4.hpp} (90%) diff --git a/include/RAJA/pattern/register.hpp b/include/RAJA/pattern/register.hpp new file mode 100644 index 0000000000..57f47b8a6d --- /dev/null +++ b/include/RAJA/pattern/register.hpp @@ -0,0 +1,47 @@ +/*! 
+ ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining SIMD/SIMT register operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_pattern_register_HPP +#define RAJA_pattern_register_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/macros.hpp" + +namespace RAJA +{ + + +/*! + * \file + * Vector operation functions in the namespace RAJA + + * + */ + + template + class Register; + + +} // namespace RAJA + + +#include +#include + + +#endif diff --git a/include/RAJA/pattern/vector.hpp b/include/RAJA/pattern/vector.hpp index e685929df1..9457856fdf 100644 --- a/include/RAJA/pattern/vector.hpp +++ b/include/RAJA/pattern/vector.hpp @@ -3,7 +3,7 @@ * * \file * - * \brief RAJA header file defining vector operations. + * \brief RAJA header file defining SIMD/SIMT register operations. * ****************************************************************************** */ @@ -22,6 +22,8 @@ #include "RAJA/util/macros.hpp" +#include + namespace RAJA { @@ -33,46 +35,56 @@ namespace RAJA * */ - template - class SimdRegister; - + template + class FixedVector; - /** - * A specialization for a single element SIMD register. - * We will implement this as a scalar value, and let the compiler use - * whatever registers it deems appropriate. 
- */ - template - class SimdRegister{ + template class REGISTER_TYPE, typename REGISTER_POLICY, typename ELEMENT_TYPE, size_t NUM_REG_ELEM, size_t NUM_ELEM> + class FixedVector, NUM_ELEM> + { public: - using self_type = SimdRegister; - using element_type = T; + using full_register_type = + REGISTER_TYPE; + static constexpr size_t s_num_register_elem = NUM_REG_ELEM; + + using self_type = FixedVector; + using element_type = ELEMENT_TYPE; - static constexpr size_t s_num_elem = 1; - static constexpr size_t s_byte_width = sizeof(T); + + static constexpr size_t s_num_elem = NUM_ELEM; + static constexpr size_t s_byte_width = sizeof(element_type); static constexpr size_t s_bit_width = s_byte_width*8; - private: - T m_value; + static constexpr size_t s_num_full_registers = s_num_elem / s_num_register_elem; + + static constexpr size_t s_num_full_elem = s_num_full_registers*s_num_register_elem; + + static constexpr size_t s_num_partial_registers = + s_num_full_elem == s_num_elem ? 0 : 1; + + static constexpr size_t s_num_partial_elem = s_num_elem - s_num_full_elem; + + using partial_register_type = + REGISTER_TYPE; + + private: + std::array m_full_registers; + std::array m_partial_register; public: - /*! - * @brief Default constructor, zeros register contents - */ - SimdRegister() : m_value(0) { - } /*! - * @brief Copy constructor from underlying simd register + * @brief Default constructor, zeros register contents */ - SimdRegister(T const &c) : m_value(c) {} - + FixedVector() = default; /*! * @brief Copy constructor */ - SimdRegister(self_type const &c) : m_value(c.m_value) {} + FixedVector(self_type const &c) : + m_full_registers(c.m_full_registers), + m_partial_register(c.m_partial_register) + {} /*! @@ -80,7 +92,12 @@ namespace RAJA * locations. 
*/ void load(element_type const *ptr){ - m_value = ptr[0]; + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i].load(ptr + i*s_num_register_elem); + } + if(s_num_partial_registers){ + m_partial_register[0].load(ptr + s_num_full_elem); + } } /*! @@ -91,8 +108,13 @@ namespace RAJA * Note: this could be done with "gather" instructions if they are * available. (like in avx2, but not in avx) */ - void load(element_type const *ptr, size_t ){ - m_value = ptr[0]; + void load(element_type const *ptr, size_t stride){ + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i].load(ptr + i*stride*s_num_register_elem, stride); + } + if(s_num_partial_registers){ + m_partial_register[0].load(ptr + stride*s_num_full_elem, stride); + } } @@ -101,7 +123,12 @@ namespace RAJA * locations. */ void store(element_type *ptr) const{ - ptr[0] = m_value; + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i].store(ptr + i*s_num_register_elem); + } + if(s_num_partial_registers){ + m_partial_register[0].store(ptr + s_num_full_elem); + } } /*! @@ -112,21 +139,38 @@ namespace RAJA * Note: this could be done with "scatter" instructions if they are * available. */ - void store(element_type *ptr, size_t) const{ - ptr[0] = m_value; + void store(element_type *ptr, size_t stride) const{ + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i].store(ptr + i*stride*s_num_register_elem, stride); + } + if(s_num_partial_registers){ + m_partial_register[0].store(ptr + stride*s_num_full_elem, stride); + } } /*! - * @brief Get scalar value from vector register + * @brief Get scalar value from vector + * This will not be the most efficient due to the offset calculation. 
* @param i Offset of scalar to get * @return Returns scalar value at i */ - template - constexpr - RAJA_INLINE - element_type operator[](IDX) const - {return m_value;} + element_type operator[](size_t i) const + { + // compute the register + size_t r = i/s_num_register_elem; + + // compute the element in the register (equiv: i % s_num_register_elem) + size_t e = i - (r*s_num_register_elem); + +// printf("i=%d, r=%d, e=%d, s_num_register_elem=%d\n", +// (int)i, (int)r, (int)e, (int)s_num_register_elem); + + if(r < s_num_full_registers){ + return m_full_registers[r][e]; + } + return m_partial_register[0][e]; + } /*! @@ -134,10 +178,24 @@ namespace RAJA * @param i Offset of scalar to set * @param value Value of scalar to set */ - template - RAJA_INLINE - void set(IDX , element_type value) - {m_value = value;} + void set(size_t i, element_type value) + { + // compute the register + size_t r = i/s_num_register_elem; + + // compute the element in the register (equiv: i % s_num_register_elem) + size_t e = i - (r*s_num_register_elem); + +// printf("i=%d, r=%d, e=%d, s_num_register_elem=%d\n", +// (int)i, (int)r, (int)e, (int)s_num_register_elem); + + if(r < s_num_full_registers){ + m_full_registers[r].set(e, value); + } + else{ + m_partial_register[0].set(e, value); + } + } /*! * @brief Set entire vector to a single scalar value @@ -146,8 +204,12 @@ namespace RAJA RAJA_INLINE self_type const &operator=(element_type value) { - m_value = value; - return *this; + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] = value; + } + if(s_num_partial_registers){ + m_partial_register[0] = value; + } } /*! 
@@ -158,7 +220,12 @@ namespace RAJA RAJA_INLINE self_type const &operator=(self_type const &x) { - m_value = x.m_value; + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] = x.m_full_registers[i]; + } + if(s_num_partial_registers){ + m_partial_register[0] = x.m_partial_register[0]; + } return *this; } @@ -171,7 +238,16 @@ namespace RAJA RAJA_INLINE self_type operator+(self_type const &x) const { - return self_type(m_value + x.m_value); + self_type result(*this); + + for(size_t i = 0;i < s_num_full_registers;++ i){ + result.m_full_registers[i] += x.m_full_registers[i]; + } + if(s_num_partial_registers){ + result.m_partial_register[0] += x.m_partial_register[0]; + } + + return result; } /*! @@ -182,7 +258,13 @@ namespace RAJA RAJA_INLINE self_type const &operator+=(self_type const &x) { - m_value = m_value + x.m_value; + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] += x.m_full_registers[i]; + } + if(s_num_partial_registers){ + m_partial_register[0] += x.m_partial_register[0]; + } + return *this; } @@ -194,7 +276,16 @@ namespace RAJA RAJA_INLINE self_type operator-(self_type const &x) const { - return self_type(m_value - x.m_value); + self_type result(*this); + + for(size_t i = 0;i < s_num_full_registers;++ i){ + result.m_full_registers[i] -= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + result.m_partial_register[0] -= x.m_partial_register[0]; + } + + return result; } /*! 
@@ -205,7 +296,13 @@ namespace RAJA RAJA_INLINE self_type const &operator-=(self_type const &x) { - m_value = m_value - x.m_value; + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] -= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + m_partial_register[0] -= x.m_partial_register[0]; + } + return *this; } @@ -217,7 +314,16 @@ namespace RAJA RAJA_INLINE self_type operator*(self_type const &x) const { - return self_type(m_value * x.m_value); + self_type result(*this); + + for(size_t i = 0;i < s_num_full_registers;++ i){ + result.m_full_registers[i] *= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + result.m_partial_register[0] *= x.m_partial_register[0]; + } + + return result; } /*! @@ -228,7 +334,13 @@ namespace RAJA RAJA_INLINE self_type const &operator*=(self_type const &x) { - m_value = m_value * x.m_value; + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] *= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + m_partial_register[0] *= x.m_partial_register[0]; + } + return *this; } @@ -240,7 +352,16 @@ namespace RAJA RAJA_INLINE self_type operator/(self_type const &x) const { - return self_type(m_value / x.m_value); + self_type result(*this); + + for(size_t i = 0;i < s_num_full_registers;++ i){ + result.m_full_registers[i] /= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + result.m_partial_register[0] /= x.m_partial_register[0]; + } + + return result; } /*! 
@@ -251,7 +372,13 @@ namespace RAJA RAJA_INLINE self_type const &operator/=(self_type const &x) { - m_value = m_value / x.m_value; + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] /= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + m_partial_register[0] /= x.m_partial_register[0]; + } + return *this; } @@ -262,7 +389,14 @@ namespace RAJA RAJA_INLINE element_type sum() const { - return m_value; + element_type result = (element_type)0; + for(size_t i = 0;i < s_num_full_registers;++ i){ + result += m_full_registers[i].sum(); + } + if(s_num_partial_registers){ + result += m_partial_register[0].sum(); + } + return result; } /*! @@ -273,7 +407,14 @@ namespace RAJA RAJA_INLINE element_type dot(self_type const &x) const { - return m_value*x.m_value; + element_type result = (element_type)0; + for(size_t i = 0;i < s_num_full_registers;++ i){ + result += m_full_registers[i].dot(x.m_full_registers[i]); + } + if(s_num_partial_registers){ + result += m_partial_register[0].dot(x.m_partial_register[0]); + } + return result; } @@ -284,7 +425,18 @@ namespace RAJA RAJA_INLINE element_type max() const { - return m_value; + if(s_num_full_registers == 0){ + return m_partial_register[0].max(); + } + + element_type result = (element_type)m_full_registers[0].max(); + for(size_t i = 1;i < s_num_full_registers;++ i){ + result = std::max(result, m_full_registers[i].max()); + } + if(s_num_partial_registers){ + result = std::max(result, m_partial_register[0].max()); + } + return result; } /*! 
@@ -294,16 +446,23 @@ namespace RAJA RAJA_INLINE element_type min() const { - return m_value; + if(s_num_full_registers == 0){ + return m_partial_register[0].min(); + } + + element_type result = (element_type)m_full_registers[0].min(); + for(size_t i = 1;i < s_num_full_registers;++ i){ + result = std::min(result, m_full_registers[i].min()); + } + if(s_num_partial_registers){ + result = std::min(result, m_partial_register[0].min()); + } + return result; } + }; } // namespace RAJA -#include -#include -#include - - #endif diff --git a/include/RAJA/policy/sequential/register.hpp b/include/RAJA/policy/sequential/register.hpp new file mode 100644 index 0000000000..7d0c54254a --- /dev/null +++ b/include/RAJA/policy/sequential/register.hpp @@ -0,0 +1,23 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA simd policy definitions. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_sequential_register_HPP +#define RAJA_policy_sequential_register_HPP + +#include + +#endif diff --git a/include/RAJA/policy/sequential/register/scalar.hpp b/include/RAJA/policy/sequential/register/scalar.hpp new file mode 100644 index 0000000000..4963556b9c --- /dev/null +++ b/include/RAJA/policy/sequential/register/scalar.hpp @@ -0,0 +1,312 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining SIMD/SIMT register operations. 
+ * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_sequential_register_scalar_HPP +#define RAJA_policy_sequential_register_scalar_HPP + +#include + +namespace RAJA +{ + + + /** + * A specialization for a single element register. + * We will implement this as a scalar value, and let the compiler use + * whatever registers it deems appropriate. + */ + template + class Register{ + public: + using self_type = Register; + using element_type = T; + + static constexpr size_t s_num_elem = 1; + static constexpr size_t s_byte_width = sizeof(T); + static constexpr size_t s_bit_width = s_byte_width*8; + + private: + T m_value; + + public: + + /*! + * @brief Default constructor, zeros register contents + */ + Register() : m_value(0) { + } + + /*! + * @brief Copy constructor from underlying simd register + */ + Register(T const &c) : m_value(c) {} + + + /*! + * @brief Copy constructor + */ + Register(self_type const &c) : m_value(c.m_value) {} + + + /*! + * @brief Load constructor, assuming scalars are in consecutive memory + * locations. + */ + void load(element_type const *ptr){ + m_value = ptr[0]; + } + + /*! + * @brief Strided load constructor, when scalars are located in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) + */ + void load(element_type const *ptr, size_t ){ + m_value = ptr[0]; + } + + + /*! + * @brief Store operation, assuming scalars are in consecutive memory + * locations. 
+ */ + void store(element_type *ptr) const{ + ptr[0] = m_value; + } + + /*! + * @brief Strided store operation, where scalars are stored in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "scatter" instructions if they are + * available. + */ + void store(element_type *ptr, size_t) const{ + ptr[0] = m_value; + } + + + /*! + * @brief Get scalar value from vector register + * @param i Offset of scalar to get + * @return Returns scalar value at i + */ + template + constexpr + RAJA_INLINE + element_type operator[](IDX) const + {return m_value;} + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + template + RAJA_INLINE + void set(IDX , element_type value) + {m_value = value;} + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + m_value = value; + return *this; + } + + /*! + * @brief Assign one register to antoher + * @param x Vector to copy + * @return Value of (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + m_value = x.m_value; + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + return self_type(m_value + x.m_value); + } + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + m_value = m_value + x.m_value; + return *this; + } + + /*! 
+ * @brief Subtract two vector registers + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + return self_type(m_value - x.m_value); + } + + /*! + * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + m_value = m_value - x.m_value; + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + return self_type(m_value * x.m_value); + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiple with this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + m_value = m_value * x.m_value; + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + return self_type(m_value / x.m_value); + } + + /*! + * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + m_value = m_value / x.m_value; + return *this; + } + + /*! + * @brief Sum the elements of this vector + * @return Sum of the values of the vectors scalar elements + */ + RAJA_INLINE + element_type sum() const + { + return m_value; + } + + /*! + * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + return m_value*x.m_value; + } + + + /*! 
+ * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + return m_value; + } + + /*! + * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmax(self_type a) const + { + return self_type(std::max(m_value, a.m_value)); + } + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + return m_value; + } + + /*! + * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmin(self_type a) const + { + return self_type(std::min(m_value, a.m_value)); + } + + }; + +} // namespace RAJA + + +#endif diff --git a/include/RAJA/policy/simd/policy.hpp b/include/RAJA/policy/simd/policy.hpp index 3a78f7f40f..73ecf7ffdf 100644 --- a/include/RAJA/policy/simd/policy.hpp +++ b/include/RAJA/policy/simd/policy.hpp @@ -44,11 +44,14 @@ struct simd_exec : make_policy_pattern_launch_platform_t { }; +struct simd_register{}; + } // end of namespace simd } // end of namespace policy using policy::simd::simd_exec; +using policy::simd::simd_register; } // end of namespace RAJA diff --git a/include/RAJA/policy/simd/register.hpp b/include/RAJA/policy/simd/register.hpp new file mode 100644 index 0000000000..ae9b6ad98f --- /dev/null +++ b/include/RAJA/policy/simd/register.hpp @@ -0,0 +1,28 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA simd policy definitions. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_simd_register_HPP +#define RAJA_policy_simd_register_HPP + +#include +#include + +#include +#include +#include + +#endif diff --git a/include/RAJA/policy/simd/register/double2.hpp b/include/RAJA/policy/simd/register/avx_double2.hpp similarity index 89% rename from include/RAJA/policy/simd/register/double2.hpp rename to include/RAJA/policy/simd/register/avx_double2.hpp index a5fdb5de0b..9ed7b48a68 100644 --- a/include/RAJA/policy/simd/register/double2.hpp +++ b/include/RAJA/policy/simd/register/avx_double2.hpp @@ -15,8 +15,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef RAJA_policy_simd_register_double2_HPP -#define RAJA_policy_simd_register_double2_HPP +#ifndef RAJA_policy_simd_register_avx_double2_HPP +#define RAJA_policy_simd_register_avx_double2_HPP #include "RAJA/config.hpp" #include "RAJA/util/macros.hpp" @@ -31,9 +31,9 @@ namespace RAJA template<> - class SimdRegister{ + class Register{ public: - using self_type = SimdRegister; + using self_type = Register; using element_type = double; static constexpr size_t s_num_elem = 2; @@ -50,19 +50,19 @@ namespace RAJA /*! * @brief Default constructor, zeros register contents */ - SimdRegister() : m_value(_mm_setzero_pd()) { + Register() : m_value(_mm_setzero_pd()) { } /*! * @brief Copy constructor from underlying simd register */ - explicit SimdRegister(simd_type const &c) : m_value(c) {} + explicit Register(simd_type const &c) : m_value(c) {} /*! * @brief Copy constructor */ - SimdRegister(self_type const &c) : m_value(c.m_value) {} + Register(self_type const &c) : m_value(c.m_value) {} /*! * @brief Load operation, assuming scalars are in consecutive memory @@ -283,6 +283,16 @@ namespace RAJA return b[0]; } + /*! 
+ * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmax(self_type a) const + { + return self_type(_mm_max_pd(m_value, a.m_value)); + } + /*! * @brief Returns the largest element * @return The largest scalar element in the register @@ -299,6 +309,16 @@ namespace RAJA // return the lower lane return b[0]; } + + /*! + * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmin(self_type a) const + { + return self_type(_mm_min_pd(m_value, a.m_value)); + } }; diff --git a/include/RAJA/policy/simd/register/double3.hpp b/include/RAJA/policy/simd/register/avx_double3.hpp similarity index 87% rename from include/RAJA/policy/simd/register/double3.hpp rename to include/RAJA/policy/simd/register/avx_double3.hpp index 2fdea6ed7c..e76124cfb4 100644 --- a/include/RAJA/policy/simd/register/double3.hpp +++ b/include/RAJA/policy/simd/register/avx_double3.hpp @@ -15,8 +15,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef RAJA_policy_simd_register_double3_HPP -#define RAJA_policy_simd_register_double3_HPP +#ifndef RAJA_policy_simd_register_avx_double3_HPP +#define RAJA_policy_simd_register_avx_double3_HPP #include "RAJA/config.hpp" #include "RAJA/util/macros.hpp" @@ -31,9 +31,9 @@ namespace RAJA template<> - class SimdRegister{ + class Register{ public: - using self_type = SimdRegister; + using self_type = Register; using element_type = double; static constexpr size_t s_num_elem = 3; @@ -57,24 +57,28 @@ namespace RAJA /*! * @brief Default constructor, zeros register contents */ - SimdRegister() : m_value(_mm256_setzero_pd()) { + RAJA_INLINE + Register() : m_value(_mm256_setzero_pd()) { } /*! 
* @brief Copy constructor from underlying simd register */ - explicit SimdRegister(simd_type const &c) : m_value(c) {} + RAJA_INLINE + explicit Register(simd_type const &c) : m_value(c) {} /*! * @brief Copy constructor */ - SimdRegister(self_type const &c) : m_value(c.m_value) {} + RAJA_INLINE + Register(self_type const &c) : m_value(c.m_value) {} /*! * @brief Load constructor, assuming scalars are in consecutive memory * locations. */ + RAJA_INLINE void load(element_type const *ptr){ m_value = _mm256_maskload_pd(ptr, s_mask); } @@ -87,6 +91,7 @@ namespace RAJA * Note: this could be done with "gather" instructions if they are * available. (like in avx2, but not in avx) */ + RAJA_INLINE void load(element_type const *ptr, size_t stride){ m_value =_mm256_set_pd(0.0, ptr[2*stride], @@ -99,6 +104,7 @@ namespace RAJA * @brief Store operation, assuming scalars are in consecutive memory * locations. */ + RAJA_INLINE void store(element_type *ptr) const{ _mm256_maskstore_pd(ptr, m_value, s_mask); } @@ -111,6 +117,7 @@ namespace RAJA * Note: this could be done with "scatter" instructions if they are * available. */ + RAJA_INLINE void store(element_type *ptr, size_t stride) const{ for(size_t i = 0;i < s_num_elem;++ i){ ptr[i*stride] = m_value[i]; @@ -284,19 +291,26 @@ namespace RAJA RAJA_INLINE element_type max() const { - // permute the first two and last two lanes of the register + // permute the first two lanes of the register simd_type a = _mm256_shuffle_pd(m_value, m_value, 0x01); // take the minimum value of each lane - // this gives us b=XXYY where - // X = min(a[0], a[1]) - // Y = min(a[2], a[3]) simd_type b = _mm256_max_pd(m_value, a); // now take the minimum of a lower and upper lane return std::max(b[0], b[2]); } + /*! + * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmax(self_type a) const + { + return self_type(_mm256_max_pd(m_value, a.m_value)); + } + /*! 
* @brief Returns the largest element * @return The largest scalar element in the register @@ -304,20 +318,27 @@ namespace RAJA RAJA_INLINE element_type min() const { - // permute the first two and last two lanes of the register + // permute the first two lanes of the register // m_value = ABCD // a = AACC simd_type a = _mm256_shuffle_pd(m_value, m_value, 0x01); // take the minimum value of each lane - // this gives us b=XXYY where - // X = min(a[0], a[1]) - // Y = min(a[2], a[3]) simd_type b = _mm256_min_pd(m_value, a); // now take the minimum of a lower and upper lane return std::min(b[0], b[2]); } + + /*! + * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmin(self_type a) const + { + return self_type(_mm256_min_pd(m_value, a.m_value)); + } }; diff --git a/include/RAJA/policy/simd/register/double4.hpp b/include/RAJA/policy/simd/register/avx_double4.hpp similarity index 90% rename from include/RAJA/policy/simd/register/double4.hpp rename to include/RAJA/policy/simd/register/avx_double4.hpp index 94803082f3..228db7b1e8 100644 --- a/include/RAJA/policy/simd/register/double4.hpp +++ b/include/RAJA/policy/simd/register/avx_double4.hpp @@ -31,9 +31,9 @@ namespace RAJA template<> - class SimdRegister{ + class Register{ public: - using self_type = SimdRegister; + using self_type = Register; using element_type = double; static constexpr size_t s_num_elem = 4; @@ -50,25 +50,29 @@ namespace RAJA /*! * @brief Default constructor, zeros register contents */ - SimdRegister() : m_value(_mm256_setzero_pd()) { + RAJA_INLINE + Register() : m_value(_mm256_setzero_pd()) { } /*! * @brief Copy constructor from underlying simd register */ - explicit SimdRegister(simd_type const &c) : m_value(c) {} + RAJA_INLINE + explicit Register(simd_type const &c) : m_value(c) {} /*! 
* @brief Copy constructor */ - SimdRegister(self_type const &c) : m_value(c.m_value) {} + RAJA_INLINE + Register(self_type const &c) : m_value(c.m_value) {} /*! * @brief Load constructor, assuming scalars are in consecutive memory * locations. */ + RAJA_INLINE void load(element_type const *ptr){ m_value = _mm256_loadu_pd(ptr); } @@ -81,6 +85,7 @@ namespace RAJA * Note: this could be done with "gather" instructions if they are * available. (like in avx2, but not in avx) */ + RAJA_INLINE void load(element_type const *ptr, size_t stride){ m_value =_mm256_set_pd(ptr[3*stride], ptr[2*stride], @@ -94,6 +99,7 @@ namespace RAJA * @brief Store operation, assuming scalars are in consecutive memory * locations. */ + RAJA_INLINE void store(element_type *ptr) const{ _mm256_storeu_pd(ptr, m_value); } @@ -106,6 +112,7 @@ namespace RAJA * Note: this could be done with "scatter" instructions if they are * available. */ + RAJA_INLINE void store(element_type *ptr, size_t stride) const{ for(size_t i = 0;i < s_num_elem;++ i){ ptr[i*stride] = m_value[i]; @@ -292,6 +299,16 @@ namespace RAJA return std::max(b[0], b[2]); } + /*! + * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmax(self_type a) const + { + return self_type(_mm256_max_pd(m_value, a.m_value)); + } + /*! * @brief Returns the largest element * @return The largest scalar element in the register @@ -313,6 +330,16 @@ namespace RAJA // now take the minimum of a lower and upper lane return std::min(b[0], b[2]); } + + /*! 
+ * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmin(self_type a) const + { + return self_type(_mm256_min_pd(m_value, a.m_value)); + } }; diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index 96c68fa220..0845f5475b 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -6,24 +6,32 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// Source file containing tests for basic vector operations +/// Source file containing tests for basic simd/simt vector operations /// #include "RAJA/RAJA.hpp" #include "gtest/gtest.h" +#include "RAJA/pattern/register.hpp" #include "RAJA/pattern/vector.hpp" +namespace RAJA{ +template +using SimdRegister = Register; +} - -using TestTypes = ::testing::Types, +using RegisterTestTypes = ::testing::Types, RAJA::SimdRegister, RAJA::SimdRegister, RAJA::SimdRegister, RAJA::SimdRegister, - RAJA::SimdRegister>; - + RAJA::SimdRegister, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>>; +//usingRegister TestTypes = ::testing::Types, 27>>; template class RegisterTest : public ::testing::Test @@ -316,4 +324,4 @@ REGISTER_TYPED_TEST_CASE_P(RegisterTest, SimdRegisterSetGet, SimdRegisterMax, SimdRegisterMin); -INSTANTIATE_TYPED_TEST_CASE_P(SIMD, RegisterTest, TestTypes); +INSTANTIATE_TYPED_TEST_CASE_P(SIMD, RegisterTest, RegisterTestTypes); From 5483dbbfc1ac75886ee0f74e1fbe8e9c31bedfa1 Mon Sep 17 00:00:00 2001 From: "Adam J. 
Kunen" Date: Wed, 6 Nov 2019 15:24:38 -0800 Subject: [PATCH 004/593] Working forall policies for fixed and streaming --- include/RAJA/pattern/register.hpp | 2 - include/RAJA/pattern/vector.hpp | 447 +----------------- include/RAJA/pattern/vector/FixedVector.hpp | 470 +++++++++++++++++++ include/RAJA/pattern/vector/StreamVector.hpp | 456 ++++++++++++++++++ include/RAJA/policy/sequential.hpp | 1 + include/RAJA/policy/simd.hpp | 1 + include/RAJA/policy/simd/forall.hpp | 47 ++ include/RAJA/policy/simd/policy.hpp | 77 +++ test/unit/test-vector.cpp | 203 +++++++- 9 files changed, 1242 insertions(+), 462 deletions(-) create mode 100644 include/RAJA/pattern/vector/FixedVector.hpp create mode 100644 include/RAJA/pattern/vector/StreamVector.hpp diff --git a/include/RAJA/pattern/register.hpp b/include/RAJA/pattern/register.hpp index 57f47b8a6d..561b8c9bf2 100644 --- a/include/RAJA/pattern/register.hpp +++ b/include/RAJA/pattern/register.hpp @@ -40,8 +40,6 @@ namespace RAJA } // namespace RAJA -#include -#include #endif diff --git a/include/RAJA/pattern/vector.hpp b/include/RAJA/pattern/vector.hpp index 9457856fdf..6039e73aef 100644 --- a/include/RAJA/pattern/vector.hpp +++ b/include/RAJA/pattern/vector.hpp @@ -18,451 +18,8 @@ #ifndef RAJA_pattern_vector_HPP #define RAJA_pattern_vector_HPP -#include "RAJA/config.hpp" - -#include "RAJA/util/macros.hpp" - -#include - -namespace RAJA -{ - - -/*! 
- * \file - * Vector operation functions in the namespace RAJA - - * - */ - - template - class FixedVector; - - template class REGISTER_TYPE, typename REGISTER_POLICY, typename ELEMENT_TYPE, size_t NUM_REG_ELEM, size_t NUM_ELEM> - class FixedVector, NUM_ELEM> - { - public: - using full_register_type = - REGISTER_TYPE; - static constexpr size_t s_num_register_elem = NUM_REG_ELEM; - - using self_type = FixedVector; - using element_type = ELEMENT_TYPE; - - - static constexpr size_t s_num_elem = NUM_ELEM; - static constexpr size_t s_byte_width = sizeof(element_type); - static constexpr size_t s_bit_width = s_byte_width*8; - - - static constexpr size_t s_num_full_registers = s_num_elem / s_num_register_elem; - - static constexpr size_t s_num_full_elem = s_num_full_registers*s_num_register_elem; - - static constexpr size_t s_num_partial_registers = - s_num_full_elem == s_num_elem ? 0 : 1; - - static constexpr size_t s_num_partial_elem = s_num_elem - s_num_full_elem; - - using partial_register_type = - REGISTER_TYPE; - - private: - std::array m_full_registers; - std::array m_partial_register; - public: - - - /*! - * @brief Default constructor, zeros register contents - */ - FixedVector() = default; - - /*! - * @brief Copy constructor - */ - FixedVector(self_type const &c) : - m_full_registers(c.m_full_registers), - m_partial_register(c.m_partial_register) - {} - - - /*! - * @brief Load constructor, assuming scalars are in consecutive memory - * locations. - */ - void load(element_type const *ptr){ - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i].load(ptr + i*s_num_register_elem); - } - if(s_num_partial_registers){ - m_partial_register[0].load(ptr + s_num_full_elem); - } - } - - /*! - * @brief Strided load constructor, when scalars are located in memory - * locations ptr, ptr+stride, ptr+2*stride, etc. - * - * - * Note: this could be done with "gather" instructions if they are - * available. 
(like in avx2, but not in avx) - */ - void load(element_type const *ptr, size_t stride){ - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i].load(ptr + i*stride*s_num_register_elem, stride); - } - if(s_num_partial_registers){ - m_partial_register[0].load(ptr + stride*s_num_full_elem, stride); - } - } - - - /*! - * @brief Store operation, assuming scalars are in consecutive memory - * locations. - */ - void store(element_type *ptr) const{ - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i].store(ptr + i*s_num_register_elem); - } - if(s_num_partial_registers){ - m_partial_register[0].store(ptr + s_num_full_elem); - } - } - - /*! - * @brief Strided store operation, where scalars are stored in memory - * locations ptr, ptr+stride, ptr+2*stride, etc. - * - * - * Note: this could be done with "scatter" instructions if they are - * available. - */ - void store(element_type *ptr, size_t stride) const{ - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i].store(ptr + i*stride*s_num_register_elem, stride); - } - if(s_num_partial_registers){ - m_partial_register[0].store(ptr + stride*s_num_full_elem, stride); - } - } - - - /*! - * @brief Get scalar value from vector - * This will not be the most efficient due to the offset calculation. - * @param i Offset of scalar to get - * @return Returns scalar value at i - */ - element_type operator[](size_t i) const - { - // compute the register - size_t r = i/s_num_register_elem; - - // compute the element in the register (equiv: i % s_num_register_elem) - size_t e = i - (r*s_num_register_elem); - -// printf("i=%d, r=%d, e=%d, s_num_register_elem=%d\n", -// (int)i, (int)r, (int)e, (int)s_num_register_elem); - - if(r < s_num_full_registers){ - return m_full_registers[r][e]; - } - return m_partial_register[0][e]; - } - - - /*! 
- * @brief Set scalar value in vector register - * @param i Offset of scalar to set - * @param value Value of scalar to set - */ - void set(size_t i, element_type value) - { - // compute the register - size_t r = i/s_num_register_elem; - - // compute the element in the register (equiv: i % s_num_register_elem) - size_t e = i - (r*s_num_register_elem); - -// printf("i=%d, r=%d, e=%d, s_num_register_elem=%d\n", -// (int)i, (int)r, (int)e, (int)s_num_register_elem); - - if(r < s_num_full_registers){ - m_full_registers[r].set(e, value); - } - else{ - m_partial_register[0].set(e, value); - } - } - - /*! - * @brief Set entire vector to a single scalar value - * @param value Value to set all vector elements to - */ - RAJA_INLINE - self_type const &operator=(element_type value) - { - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i] = value; - } - if(s_num_partial_registers){ - m_partial_register[0] = value; - } - } - - /*! - * @brief Assign one register to antoher - * @param x Vector to copy - * @return Value of (*this) - */ - RAJA_INLINE - self_type const &operator=(self_type const &x) - { - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i] = x.m_full_registers[i]; - } - if(s_num_partial_registers){ - m_partial_register[0] = x.m_partial_register[0]; - } - return *this; - } - - - /*! - * @brief Add two vector registers - * @param x Vector to add to this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator+(self_type const &x) const - { - self_type result(*this); - - for(size_t i = 0;i < s_num_full_registers;++ i){ - result.m_full_registers[i] += x.m_full_registers[i]; - } - if(s_num_partial_registers){ - result.m_partial_register[0] += x.m_partial_register[0]; - } - - return result; - } - - /*! 
- * @brief Add a vector to this vector - * @param x Vector to add to this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator+=(self_type const &x) - { - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i] += x.m_full_registers[i]; - } - if(s_num_partial_registers){ - m_partial_register[0] += x.m_partial_register[0]; - } - - return *this; - } - - /*! - * @brief Subtract two vector registers - * @param x Vector to subctract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator-(self_type const &x) const - { - self_type result(*this); - - for(size_t i = 0;i < s_num_full_registers;++ i){ - result.m_full_registers[i] -= x.m_full_registers[i]; - } - if(s_num_partial_registers){ - result.m_partial_register[0] -= x.m_partial_register[0]; - } - - return result; - } - - /*! - * @brief Subtract a vector from this vector - * @param x Vector to subtract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator-=(self_type const &x) - { - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i] -= x.m_full_registers[i]; - } - if(s_num_partial_registers){ - m_partial_register[0] -= x.m_partial_register[0]; - } - - return *this; - } - - /*! - * @brief Multiply two vector registers, element wise - * @param x Vector to subctract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator*(self_type const &x) const - { - self_type result(*this); - - for(size_t i = 0;i < s_num_full_registers;++ i){ - result.m_full_registers[i] *= x.m_full_registers[i]; - } - if(s_num_partial_registers){ - result.m_partial_register[0] *= x.m_partial_register[0]; - } - - return result; - } - - /*! 
- * @brief Multiply a vector with this vector - * @param x Vector to multiple with this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator*=(self_type const &x) - { - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i] *= x.m_full_registers[i]; - } - if(s_num_partial_registers){ - m_partial_register[0] *= x.m_partial_register[0]; - } - - return *this; - } - - /*! - * @brief Divide two vector registers, element wise - * @param x Vector to subctract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator/(self_type const &x) const - { - self_type result(*this); - - for(size_t i = 0;i < s_num_full_registers;++ i){ - result.m_full_registers[i] /= x.m_full_registers[i]; - } - if(s_num_partial_registers){ - result.m_partial_register[0] /= x.m_partial_register[0]; - } - - return result; - } - - /*! - * @brief Divide this vector by another vector - * @param x Vector to divide by - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator/=(self_type const &x) - { - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i] /= x.m_full_registers[i]; - } - if(s_num_partial_registers){ - m_partial_register[0] /= x.m_partial_register[0]; - } - - return *this; - } - - /*! - * @brief Sum the elements of this vector - * @return Sum of the values of the vectors scalar elements - */ - RAJA_INLINE - element_type sum() const - { - element_type result = (element_type)0; - for(size_t i = 0;i < s_num_full_registers;++ i){ - result += m_full_registers[i].sum(); - } - if(s_num_partial_registers){ - result += m_partial_register[0].sum(); - } - return result; - } - - /*! 
- * @brief Dot product of two vectors - * @param x Other vector to dot with this vector - * @return Value of (*this) dot x - */ - RAJA_INLINE - element_type dot(self_type const &x) const - { - element_type result = (element_type)0; - for(size_t i = 0;i < s_num_full_registers;++ i){ - result += m_full_registers[i].dot(x.m_full_registers[i]); - } - if(s_num_partial_registers){ - result += m_partial_register[0].dot(x.m_partial_register[0]); - } - return result; - } - - - /*! - * @brief Returns the largest element - * @return The largest scalar element in the register - */ - RAJA_INLINE - element_type max() const - { - if(s_num_full_registers == 0){ - return m_partial_register[0].max(); - } - - element_type result = (element_type)m_full_registers[0].max(); - for(size_t i = 1;i < s_num_full_registers;++ i){ - result = std::max(result, m_full_registers[i].max()); - } - if(s_num_partial_registers){ - result = std::max(result, m_partial_register[0].max()); - } - return result; - } - - /*! - * @brief Returns the largest element - * @return The largest scalar element in the register - */ - RAJA_INLINE - element_type min() const - { - if(s_num_full_registers == 0){ - return m_partial_register[0].min(); - } - - element_type result = (element_type)m_full_registers[0].min(); - for(size_t i = 1;i < s_num_full_registers;++ i){ - result = std::min(result, m_full_registers[i].min()); - } - if(s_num_partial_registers){ - result = std::min(result, m_partial_register[0].min()); - } - return result; - } - - }; - -} // namespace RAJA +#include "RAJA/pattern/vector/FixedVector.hpp" +#include "RAJA/pattern/vector/StreamVector.hpp" #endif diff --git a/include/RAJA/pattern/vector/FixedVector.hpp b/include/RAJA/pattern/vector/FixedVector.hpp new file mode 100644 index 0000000000..10fc3878c4 --- /dev/null +++ b/include/RAJA/pattern/vector/FixedVector.hpp @@ -0,0 +1,470 @@ +/*! 
+ ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining SIMD/SIMT register operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_pattern_vector_fixedvector_HPP +#define RAJA_pattern_vector_fixedvector_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/macros.hpp" + +#include + +namespace RAJA +{ + + +/*! + * \file + * Vector operation functions in the namespace RAJA + + * + */ + + template + class FixedVector; + + template class REGISTER_TYPE, typename REGISTER_POLICY, typename ELEMENT_TYPE, size_t NUM_REG_ELEM, size_t NUM_ELEM> + class FixedVector, NUM_ELEM> + { + public: + using full_register_type = + REGISTER_TYPE; + static constexpr size_t s_num_register_elem = NUM_REG_ELEM; + + using self_type = FixedVector; + using element_type = ELEMENT_TYPE; + + + static constexpr size_t s_num_elem = NUM_ELEM; + static constexpr size_t s_byte_width = sizeof(element_type); + static constexpr size_t s_bit_width = s_byte_width*8; + + + static constexpr size_t s_num_full_registers = s_num_elem / s_num_register_elem; + + static constexpr size_t s_num_full_elem = s_num_full_registers*s_num_register_elem; + + static constexpr size_t s_num_partial_registers = + s_num_full_elem == s_num_elem ? 0 : 1; + + static constexpr size_t s_num_partial_elem = s_num_elem - s_num_full_elem; + + using partial_register_type = + REGISTER_TYPE; + + private: + std::array m_full_registers; + std::array m_partial_register; + public: + + + /*! 
+ * @brief Default constructor, zeros register contents + */ + FixedVector() = default; + + /*! + * @brief Copy constructor + */ + RAJA_INLINE + FixedVector(self_type const &c) : + m_full_registers(c.m_full_registers), + m_partial_register(c.m_partial_register) + {} + + + /*! + * @brief Load constructor, assuming scalars are in consecutive memory + * locations. + */ + RAJA_INLINE + void load(element_type const *ptr){ + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i].load(ptr + i*s_num_register_elem); + } + if(s_num_partial_registers){ + m_partial_register[0].load(ptr + s_num_full_elem); + } + } + + /*! + * @brief Strided load constructor, when scalars are located in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) + */ + RAJA_INLINE + void load(element_type const *ptr, size_t stride){ + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i].load(ptr + i*stride*s_num_register_elem, stride); + } + if(s_num_partial_registers){ + m_partial_register[0].load(ptr + stride*s_num_full_elem, stride); + } + } + + + /*! + * @brief Store operation, assuming scalars are in consecutive memory + * locations. + */ + RAJA_INLINE + void store(element_type *ptr) const{ + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i].store(ptr + i*s_num_register_elem); + } + if(s_num_partial_registers){ + m_partial_register[0].store(ptr + s_num_full_elem); + } + } + + /*! + * @brief Strided store operation, where scalars are stored in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "scatter" instructions if they are + * available. 
+ */ + RAJA_INLINE + void store(element_type *ptr, size_t stride) const{ + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i].store(ptr + i*stride*s_num_register_elem, stride); + } + if(s_num_partial_registers){ + m_partial_register[0].store(ptr + stride*s_num_full_elem, stride); + } + } + + + /*! + * @brief Get scalar value from vector + * This will not be the most efficient due to the offset calculation. + * @param i Offset of scalar to get + * @return Returns scalar value at i + */ + RAJA_INLINE + element_type operator[](size_t i) const + { + // compute the register + size_t r = i/s_num_register_elem; + + // compute the element in the register (equiv: i % s_num_register_elem) + size_t e = i - (r*s_num_register_elem); + + if(r < s_num_full_registers){ + return m_full_registers[r][e]; + } + return m_partial_register[0][e]; + } + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + RAJA_INLINE + void set(size_t i, element_type value) + { + // compute the register + size_t r = i/s_num_register_elem; + + // compute the element in the register (equiv: i % s_num_register_elem) + size_t e = i - (r*s_num_register_elem); + + if(r < s_num_full_registers){ + m_full_registers[r].set(e, value); + } + else{ + m_partial_register[0].set(e, value); + } + } + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] = value; + } + if(s_num_partial_registers){ + m_partial_register[0] = value; + } + return *this; + } + + /*! 
+ * @brief Assign one register to antoher + * @param x Vector to copy + * @return Value of (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] = x.m_full_registers[i]; + } + if(s_num_partial_registers){ + m_partial_register[0] = x.m_partial_register[0]; + } + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + self_type result(*this); + + for(size_t i = 0;i < s_num_full_registers;++ i){ + result.m_full_registers[i] += x.m_full_registers[i]; + } + if(s_num_partial_registers){ + result.m_partial_register[0] += x.m_partial_register[0]; + } + + return result; + } + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] += x.m_full_registers[i]; + } + if(s_num_partial_registers){ + m_partial_register[0] += x.m_partial_register[0]; + } + + return *this; + } + + /*! + * @brief Subtract two vector registers + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + self_type result(*this); + + for(size_t i = 0;i < s_num_full_registers;++ i){ + result.m_full_registers[i] -= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + result.m_partial_register[0] -= x.m_partial_register[0]; + } + + return result; + } + + /*! 
+ * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] -= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + m_partial_register[0] -= x.m_partial_register[0]; + } + + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + self_type result(*this); + + for(size_t i = 0;i < s_num_full_registers;++ i){ + result.m_full_registers[i] *= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + result.m_partial_register[0] *= x.m_partial_register[0]; + } + + return result; + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiple with this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] *= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + m_partial_register[0] *= x.m_partial_register[0]; + } + + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + self_type result(*this); + + for(size_t i = 0;i < s_num_full_registers;++ i){ + result.m_full_registers[i] /= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + result.m_partial_register[0] /= x.m_partial_register[0]; + } + + return result; + } + + /*! 
+ * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] /= x.m_full_registers[i]; + } + if(s_num_partial_registers){ + m_partial_register[0] /= x.m_partial_register[0]; + } + + return *this; + } + + /*! + * @brief Sum the elements of this vector + * @return Sum of the values of the vectors scalar elements + */ + RAJA_INLINE + element_type sum() const + { + element_type result = (element_type)0; + for(size_t i = 0;i < s_num_full_registers;++ i){ + result += m_full_registers[i].sum(); + } + if(s_num_partial_registers){ + result += m_partial_register[0].sum(); + } + return result; + } + + /*! + * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + element_type result = (element_type)0; + for(size_t i = 0;i < s_num_full_registers;++ i){ + result += m_full_registers[i].dot(x.m_full_registers[i]); + } + if(s_num_partial_registers){ + result += m_partial_register[0].dot(x.m_partial_register[0]); + } + return result; + } + + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + if(s_num_full_registers == 0){ + return m_partial_register[0].max(); + } + + element_type result = (element_type)m_full_registers[0].max(); + for(size_t i = 1;i < s_num_full_registers;++ i){ + result = std::max(result, m_full_registers[i].max()); + } + if(s_num_partial_registers){ + result = std::max(result, m_partial_register[0].max()); + } + return result; + } + + /*! 
+ * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + if(s_num_full_registers == 0){ + return m_partial_register[0].min(); + } + + element_type result = (element_type)m_full_registers[0].min(); + for(size_t i = 1;i < s_num_full_registers;++ i){ + result = std::min(result, m_full_registers[i].min()); + } + if(s_num_partial_registers){ + result = std::min(result, m_partial_register[0].min()); + } + return result; + } + + }; + +} // namespace RAJA + + +#endif diff --git a/include/RAJA/pattern/vector/StreamVector.hpp b/include/RAJA/pattern/vector/StreamVector.hpp new file mode 100644 index 0000000000..d7e8fe0ffb --- /dev/null +++ b/include/RAJA/pattern/vector/StreamVector.hpp @@ -0,0 +1,456 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining SIMD/SIMT register operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_pattern_vector_streamvector_HPP +#define RAJA_pattern_vector_streamvector_HPP + +#include "RAJA/config.hpp" +#include "RAJA/util/macros.hpp" + +namespace RAJA +{ + + +/*! 
+ * \file + * Vector operation functions in the namespace RAJA + + * + */ + + template + class StreamVector; + + template class REGISTER_TYPE, typename REGISTER_POLICY, typename ELEMENT_TYPE, size_t NUM_REG_ELEM, size_t MAX_ELEM> + class StreamVector, MAX_ELEM> + { + public: + using register_type = + REGISTER_TYPE; + static constexpr size_t s_num_register_elem = NUM_REG_ELEM; + + using self_type = StreamVector; + using element_type = ELEMENT_TYPE; + + static constexpr size_t s_num_elem = MAX_ELEM; + static constexpr size_t s_num_registers = + s_num_elem / s_num_register_elem; + + static_assert(s_num_elem % s_num_register_elem == 0, + "StreamVector must use a whole number of registers"); + + + private: + std::array m_registers; + size_t m_length; + + public: + + + /*! + * @brief Default constructor, zeros register contents + */ + RAJA_INLINE + StreamVector() : m_length(s_num_elem) {} + + /*! + * @brief Copy constructor + */ + RAJA_INLINE + StreamVector(self_type const &c) : + m_registers(c.m_registers), + m_length(c.m_length) + {} + + + /*! + * @brief Get scalar value from vector + * This will not be the most efficient due to the offset calculation. + * @param i Offset of scalar to get + * @return Returns scalar value at i + */ + RAJA_INLINE + element_type operator[](size_t i) const + { + // compute the register + size_t r = i/s_num_register_elem; + + // compute the element in the register (equiv: i % s_num_register_elem) + size_t e = i - (r*s_num_register_elem); + + return m_registers[r][e]; + } + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + RAJA_INLINE + void set(size_t i, element_type value) + { + // compute the register + size_t r = i/s_num_register_elem; + + // compute the element in the register (equiv: i % s_num_register_elem) + size_t e = i - (r*s_num_register_elem); + + m_registers[r].set(e, value); + } + + + /*! 
+ * @brief Load constructor, assuming scalars are in consecutive memory + * locations. + */ + RAJA_INLINE + void load(element_type const *ptr){ + m_length = s_num_elem; + for(size_t i = 0;i < s_num_registers;++ i){ + m_registers[i].load(ptr + i*s_num_register_elem); + } + } + + /*! + * @brief Strided load constructor, when scalars are located in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) + */ + RAJA_INLINE + void load(element_type const *ptr, size_t stride){ + m_length = s_num_elem; + for(size_t i = 0;i < s_num_registers;++ i){ + m_registers[i].load(ptr + i*s_num_register_elem*stride, stride); + } + } + + + /*! + * @brief Load constructor, assuming scalars are in consecutive memory + * locations. + */ + void load_n(element_type const *ptr, size_t len){ + if(len == s_num_elem){ + load(ptr); + } + else{ + m_length = len; + for(size_t i = 0;i < len;++ i){ + set(i, ptr[i]); + } + } + } + + /*! + * @brief Strided load constructor, when scalars are located in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) + */ + void load_n(element_type const *ptr, size_t len, size_t stride){ + if(len == s_num_elem){ + load(ptr, stride); + } + else{ + m_length = len; + for(size_t i = 0;i < len;++ i){ + set(i, ptr[i*stride]); + } + } + } + + + /*! + * @brief Store operation, assuming scalars are in consecutive memory + * locations. + */ + void store(element_type *ptr) const{ + if(m_length == s_num_elem){ + for(size_t i = 0;i < s_num_registers;++ i){ + m_registers[i].store(ptr + i*s_num_register_elem); + } + } + else{ + for(size_t i = 0;i < m_length;++ i){ + ptr[i] = (*this)[i]; + } + } + } + + /*! + * @brief Strided store operation, where scalars are stored in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. 
+ * + * + * Note: this could be done with "scatter" instructions if they are + * available. + */ + void store(element_type *ptr, size_t stride) const{ + if(m_length == s_num_elem){ + for(size_t i = 0;i < s_num_registers;++ i){ + m_registers[i].store(ptr + i*s_num_register_elem*stride, stride); + } + } + else{ + for(size_t i = 0;i < m_length;++ i){ + ptr[i*stride] = (*this)[i]; + } + } + } + + + /*! + * @brief Assign one register to antoher + * @param x Vector to copy + * @return Value of (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + m_registers = x.m_registers; + m_length = x.m_length; + return *this; + } + + + + /*! + * @brief Assign one register from a scalar + * @param x Vector to copy + * @return Value of (*this) + */ + RAJA_INLINE + self_type const &operator=(element_type const &x) + { + m_length = s_num_elem; + for(size_t i = 0;i < s_num_registers;++ i){ + m_registers[i] = x; + } + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + self_type result = *this; + result += x; + return result; + } + + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + for(size_t i = 0;i < s_num_registers;++ i){ + m_registers[i] += x.m_registers[i]; + } + m_length = std::min(m_length, x.m_length); + return *this; + } + + + /*! + * @brief Subtract two vector registers + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + self_type result = *this; + result -= x; + return result; + } + + /*! 
+ * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + for(size_t i = 0;i < s_num_registers;++ i){ + m_registers[i] -= x.m_registers[i]; + } + m_length = std::min(m_length, x.m_length); + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + self_type result = *this; + result *= x; + return result; + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiple with this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + for(size_t i = 0;i < s_num_registers;++ i){ + m_registers[i] *= x.m_registers[i]; + } + m_length = std::min(m_length, x.m_length); + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + self_type result = *this; + result /= x; + return result; + } + + /*! + * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + for(size_t i = 0;i < s_num_registers;++ i){ + m_registers[i] /= x.m_registers[i]; + } + m_length = std::min(m_length, x.m_length); + return *this; + } + + /*! 
+ * @brief Sum the elements of this vector + * @return Sum of the values of the vectors scalar elements + */ + RAJA_INLINE + element_type sum() const + { + element_type result = (element_type)0; + if(m_length == s_num_elem){ + for(size_t i = 0;i < s_num_registers;++ i){ + result += m_registers[i].sum(); + } + } + else{ + for(size_t i = 0;i < m_length;++ i){ + result += (*this)[i]; + } + } + return result; + } + + + /*! + * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + self_type z = (*this) * x; + return z.sum(); + } + + + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + if(m_length == s_num_elem){ + element_type result = m_registers[0].max(); + for(size_t i = 1;i < s_num_registers;++ i){ + result = std::max(result, m_registers[i].max()); + } + return result; + } + else{ + element_type result = (*this)[0]; + for(size_t i = 0;i < m_length;++ i){ + result = std::max(result, (*this)[i]); + } + return result; + } + } + + + + /*! 
+ * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + if(m_length == s_num_elem){ + element_type result = m_registers[0].min(); + for(size_t i = 1;i < s_num_registers;++ i){ + result = std::min(result, m_registers[i].min()); + } + return result; + } + else{ + element_type result = (*this)[0]; + for(size_t i = 0;i < m_length;++ i){ + result = std::min(result, (*this)[i]); + } + return result; + } + } + + }; + +} // namespace RAJA + + +#endif diff --git a/include/RAJA/policy/sequential.hpp b/include/RAJA/policy/sequential.hpp index b88fb5f2b4..9f858ba91d 100644 --- a/include/RAJA/policy/sequential.hpp +++ b/include/RAJA/policy/sequential.hpp @@ -25,6 +25,7 @@ #include "RAJA/policy/sequential/kernel.hpp" #include "RAJA/policy/sequential/policy.hpp" #include "RAJA/policy/sequential/reduce.hpp" +#include "RAJA/policy/sequential/register.hpp" #include "RAJA/policy/sequential/scan.hpp" diff --git a/include/RAJA/policy/simd.hpp b/include/RAJA/policy/simd.hpp index 40ae5ff661..c7a1726a76 100644 --- a/include/RAJA/policy/simd.hpp +++ b/include/RAJA/policy/simd.hpp @@ -22,6 +22,7 @@ #include "RAJA/policy/simd/forall.hpp" #include "RAJA/policy/simd/policy.hpp" +#include "RAJA/policy/simd/register.hpp" #include "RAJA/policy/simd/kernel/For.hpp" #include "RAJA/policy/simd/kernel/ForICount.hpp" diff --git a/include/RAJA/policy/simd/forall.hpp b/include/RAJA/policy/simd/forall.hpp index e5a1e22c1f..c66dba322a 100644 --- a/include/RAJA/policy/simd/forall.hpp +++ b/include/RAJA/policy/simd/forall.hpp @@ -59,6 +59,53 @@ RAJA_INLINE void forall_impl(const simd_exec &, } } + +template +RAJA_INLINE void forall_impl(const simd_fixed_exec &, + Iterable &&iter, + Func &&loop_body) +{ + auto begin = std::begin(iter); + auto end = std::end(iter); + auto distance = std::distance(begin, end); + + using index_type = camp::decay; + using simd_index_type = FixedRegisterIndex; + + for (decltype(distance) i 
= 0; i < distance; i+=Value::s_num_elem) { + loop_body(simd_index_type(*(begin + i))); + } +} + + +template +RAJA_INLINE void forall_impl(const simd_stream_exec &, + Iterable &&iter, + Func &&loop_body) +{ + auto begin = std::begin(iter); + auto end = std::end(iter); + auto distance = std::distance(begin, end); + + auto distance_simd = distance - (distance%Value::s_num_elem); + auto distance_remainder = distance - distance_simd; + + using index_type = camp::decay; + using simd_index_type = StreamRegisterIndex; + + // Streaming SIMD loop for complete elements + for (decltype(distance) i = 0; i < distance_simd; i+=Value::s_num_elem) { + loop_body(simd_index_type(*(begin + i), Value::s_num_elem)); + } + + // Postamble for reamining elements + if(distance_remainder > 0){ + loop_body(simd_index_type(*(begin + distance_simd), distance_remainder)); + } + +} + + } // namespace simd } // namespace policy diff --git a/include/RAJA/policy/simd/policy.hpp b/include/RAJA/policy/simd/policy.hpp index 73ecf7ffdf..d10d4e9e0d 100644 --- a/include/RAJA/policy/simd/policy.hpp +++ b/include/RAJA/policy/simd/policy.hpp @@ -44,6 +44,79 @@ struct simd_exec : make_policy_pattern_launch_platform_t { }; +template +struct simd_fixed_exec : make_policy_pattern_launch_platform_t { + + using value_type = VALUE_TYPE; +}; + +template +struct simd_stream_exec : make_policy_pattern_launch_platform_t { + + using value_type = VALUE_TYPE; +}; + + + +template +class FixedRegisterIndex { + public: + using index_type = IDX; + using register_type = REGISTER; + + RAJA_INLINE + FixedRegisterIndex() : m_value(0) {} + + RAJA_INLINE + explicit FixedRegisterIndex(index_type value) : m_value(value) {} + + RAJA_INLINE + constexpr + index_type operator*() const { + return m_value; + } + + private: + index_type m_value; +}; + + +template +class StreamRegisterIndex { + public: + using index_type = IDX; + using register_type = REGISTER; + + RAJA_INLINE + StreamRegisterIndex() : m_value(0), 
m_length(REGISTER::s_num_elem) {} + + RAJA_INLINE + explicit StreamRegisterIndex(index_type value, size_t length) : m_value(value), m_length(length) {} + + RAJA_INLINE + constexpr + index_type operator*() const { + return m_value; + } + + RAJA_INLINE + constexpr + size_t size() const { + return m_length; + } + + private: + index_type m_value; + size_t m_length; +}; + + struct simd_register{}; } // end of namespace simd @@ -51,6 +124,10 @@ struct simd_register{}; } // end of namespace policy using policy::simd::simd_exec; +using policy::simd::simd_fixed_exec; +using policy::simd::simd_stream_exec; +using policy::simd::FixedRegisterIndex; +using policy::simd::StreamRegisterIndex; using policy::simd::simd_register; } // end of namespace RAJA diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index 0845f5475b..c256ab07ac 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -15,21 +15,23 @@ #include "RAJA/pattern/register.hpp" #include "RAJA/pattern/vector.hpp" -namespace RAJA{ -template -using SimdRegister = Register; -} - -using RegisterTestTypes = ::testing::Types, - RAJA::SimdRegister, - RAJA::SimdRegister, - RAJA::SimdRegister, - RAJA::SimdRegister, - RAJA::SimdRegister, - RAJA::FixedVector, 27>, - RAJA::FixedVector, 27>, - RAJA::FixedVector, 27>, - RAJA::FixedVector, 27>>; +#if 0 + +using RegisterTestTypes = ::testing::Types< + RAJA::Register, + RAJA::Register, + RAJA::Register, + RAJA::Register, + RAJA::Register, + RAJA::Register, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::StreamVector, 4>, + RAJA::StreamVector, 8>, + RAJA::StreamVector, 12>, + RAJA::StreamVector, 16>>; //usingRegister TestTypes = ::testing::Types, 27>>; @@ -325,3 +327,174 @@ REGISTER_TYPED_TEST_CASE_P(RegisterTest, SimdRegisterSetGet, SimdRegisterMin); INSTANTIATE_TYPED_TEST_CASE_P(SIMD, RegisterTest, RegisterTestTypes); + + + + +TEST(StreamVectorTest, Test1) +{ + using TypeParam = 
RAJA::StreamVector, 8>; + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem], B[num_elem]; + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + } + + + for(size_t N = 1;N <= 8;++ N){ + for(size_t i = 0;i < 8;++ i){ + B[i] = 0; + } + + register_t x, y; + x.load_n(A, N); + y = 3.0; + x = x+y; + x.store(B); + + for(size_t i = 0;i < 8;++ i){ + if(i < N){ + ASSERT_DOUBLE_EQ(B[i], A[i]+3.0); + } + else + { + ASSERT_DOUBLE_EQ(B[i], 0.0); + } + } + } +} + +TEST(StreamVectorTest, TestStreamLoop) +{ + using TypeParam = RAJA::StreamVector, 32>; + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + // Use drand48 to change sizes of everything: this ensures that the compiler + // cannot optimize out sizes (and do more optimization than we want) + size_t N = 8000 + (100*drand48()); + + element_t *A = new element_t[N]; + element_t *B = new element_t[N]; + element_t *C = new element_t[N]; + for(size_t i = 0;i < N; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); + } + + size_t Nsimd = N - (N%num_elem); + size_t Nrem = N - Nsimd; + for(size_t i = 0;i < Nsimd;i += num_elem){ + register_t x,y; + x.load_n(&A[i],num_elem); + y.load_n(&B[i],num_elem); + + register_t z = x*y; + z.store(&C[i]); + } + if(Nrem > 0){ + register_t x,y; + x.load_n(&A[Nsimd], Nrem); + y.load_n(&B[Nsimd], Nrem); + + register_t z = x*y; + z.store(&C[Nsimd]); + } + + for(size_t i = 0;i < N;i ++){ + ASSERT_DOUBLE_EQ(A[i]*B[i], C[i]); + } + + delete[] A; + delete[] B; + delete[] C; +} + +TEST(StreamVectorTest, TestFixedForall) +{ + using TypeParam = RAJA::FixedVector, 8>; + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + + size_t 
N = 1024*num_elem; + + element_t *A = new element_t[N]; + element_t *B = new element_t[N]; + element_t *C = new element_t[N]; + for(size_t i = 0;i < N; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); + } + + using policy_t = RAJA::simd_fixed_exec; + + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](RAJA::FixedRegisterIndex i) + { + register_t x,y; + x.load(&A[*i]); + y.load(&B[*i]); + + register_t z = x*y; + z.store(&C[*i]); + }); + + + for(size_t i = 0;i < N;i ++){ + ASSERT_DOUBLE_EQ(A[i]*B[i], C[i]); + } + + delete[] A; + delete[] B; + delete[] C; +} +#endif +TEST(StreamVectorTest, TestStreamForall) +{ + using TypeParam = RAJA::StreamVector, 8>; + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + + + size_t N = 8000 + (100*drand48()); + + element_t *A = new element_t[N]; + element_t *B = new element_t[N]; + element_t *C = new element_t[N]; + for(size_t i = 0;i < N; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); + } + + using policy_t = RAJA::simd_stream_exec; + + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](RAJA::StreamRegisterIndex i) + { + register_t x,y; + x.load_n(&A[*i], i.size()); + y.load_n(&B[*i], i.size()); + + register_t z = x*y; + z.store(&C[*i]); + }); + + + for(size_t i = 0;i < N;i ++){ + ASSERT_DOUBLE_EQ(A[i]*B[i], C[i]); + } + + delete[] A; + delete[] B; + delete[] C; +} From a49a43003c698175217de88baf7fa5fcd6eef3d1 Mon Sep 17 00:00:00 2001 From: "Adam J. 
Kunen" Date: Thu, 7 Nov 2019 14:49:42 -0800 Subject: [PATCH 005/593] Implemented working VectorRef that will (eventually) enable using RAJA::View to read/write vectors --- include/RAJA/pattern/vector.hpp | 1 + include/RAJA/pattern/vector/FixedVector.hpp | 14 + include/RAJA/pattern/vector/StreamVector.hpp | 10 + include/RAJA/pattern/vector/VectorRef.hpp | 341 ++++++++++++++++++ include/RAJA/policy/simd/policy.hpp | 2 + .../RAJA/policy/simd/register/avx_double2.hpp | 7 + .../RAJA/policy/simd/register/avx_double3.hpp | 7 + .../RAJA/policy/simd/register/avx_double4.hpp | 7 + test/unit/test-vector.cpp | 44 ++- 9 files changed, 432 insertions(+), 1 deletion(-) create mode 100644 include/RAJA/pattern/vector/VectorRef.hpp diff --git a/include/RAJA/pattern/vector.hpp b/include/RAJA/pattern/vector.hpp index 6039e73aef..3d86f4a6ed 100644 --- a/include/RAJA/pattern/vector.hpp +++ b/include/RAJA/pattern/vector.hpp @@ -20,6 +20,7 @@ #include "RAJA/pattern/vector/FixedVector.hpp" #include "RAJA/pattern/vector/StreamVector.hpp" +#include "RAJA/pattern/vector/VectorRef.hpp" #endif diff --git a/include/RAJA/pattern/vector/FixedVector.hpp b/include/RAJA/pattern/vector/FixedVector.hpp index 10fc3878c4..423936ca7c 100644 --- a/include/RAJA/pattern/vector/FixedVector.hpp +++ b/include/RAJA/pattern/vector/FixedVector.hpp @@ -87,6 +87,20 @@ namespace RAJA m_partial_register(c.m_partial_register) {} + /*! + * @brief Scalar constructor (broadcast) + */ + RAJA_INLINE + FixedVector(element_type const &c) + { + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i] = c; + } + if(s_num_partial_registers){ + m_partial_register[0] = c; + } + } + /*! 
* @brief Load constructor, assuming scalars are in consecutive memory diff --git a/include/RAJA/pattern/vector/StreamVector.hpp b/include/RAJA/pattern/vector/StreamVector.hpp index d7e8fe0ffb..e9bb61d2d2 100644 --- a/include/RAJA/pattern/vector/StreamVector.hpp +++ b/include/RAJA/pattern/vector/StreamVector.hpp @@ -76,6 +76,16 @@ namespace RAJA m_length(c.m_length) {} + /*! + * @brief Scalar constructor (broadcast) + */ + RAJA_INLINE + StreamVector(element_type const &c) : m_length(s_num_elem) + { + for(size_t i = 0;i < s_num_registers;++ i){ + m_registers[i] = c; + } + } /*! * @brief Get scalar value from vector diff --git a/include/RAJA/pattern/vector/VectorRef.hpp b/include/RAJA/pattern/vector/VectorRef.hpp new file mode 100644 index 0000000000..b0101221e9 --- /dev/null +++ b/include/RAJA/pattern/vector/VectorRef.hpp @@ -0,0 +1,341 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining SIMD/SIMT register operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_pattern_vector_vectorref_HPP +#define RAJA_pattern_vector_vectorref_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/macros.hpp" + +#include + +namespace RAJA +{ + + +/*! 
+ * \file + * Vector operation functions in the namespace RAJA + + * + */ + + template + class VectorRef { + public: + using self_type = VectorRef; + using register_index_type = REGISTER_INDEX; + using register_type = typename register_index_type::register_type; + using element_type = typename register_type::element_type; + using pointer_type = POINTER_TYPE; + + private: + register_index_type m_index; + pointer_type m_data; + size_t m_stride; + + public: + + + /*! + * @brief Default constructor, zeros register contents + */ + RAJA_INLINE + VectorRef() : m_index(), m_data() {}; + + /*! + * @brief Stride-1 constructor + */ + RAJA_INLINE + VectorRef(register_index_type index, pointer_type pointer) : + m_index(index), + m_data(pointer), + m_stride(1) + {} + + + /*! + * @brief Strided constructor + */ + RAJA_INLINE + VectorRef(register_index_type index, pointer_type pointer, size_t stride) : + m_index(index), + m_data(pointer), + m_stride(stride) + {} + + /*! + * @brief Copy constructor + */ + RAJA_INLINE + VectorRef(self_type const &c) : + m_index(c.m_index), + m_data(c.m_data) + {} + + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + void store(register_type value) const + { + if(STRIDE_ONE){ + value.store(m_data+*m_index); + } + else{ + value.store(m_data+*m_index, m_stride); + } + } + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + register_type load() const + { + register_type value; + if(STRIDE_ONE){ + value.load(m_data+*m_index); + } + else{ + value.load(m_data+*m_index, m_stride); + } + return value; + } + + RAJA_INLINE + operator register_type() const { + return load(); + } + + + /*! 
+ * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(register_type value) + { + store(value); + return *this; + } + + + /*! + * @brief Get scalar value from vector register + * @param i Offset of scalar to get + * @return Returns scalar value at i + */ + template + constexpr + RAJA_INLINE + element_type operator[](IDX i) const + { + return load()[i]; + } + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + template + RAJA_INLINE + void set(IDX i, element_type value) + { + register_type x = load(); + x[i] = value; + store(x); + } + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + register_type x = value; + store(x); + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + register_type operator+(register_type const &x) const + { + return load() + x; + } + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(register_type const &x) + { + store(load() + x); + return *this; + } + + /*! + * @brief Subtract two vector registers + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + register_type operator-(register_type const &x) const + { + return load() - x; + } + + /*! + * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator-=(register_type const &x) + { + store(load() - x); + return *this; + } + + /*! 
+ * @brief Multiply two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + register_type operator*(register_type const &x) const + { + return load() * x; + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiple with this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator*=(register_type const &x) + { + store(load() * x); + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to subctract from this register + * @return Value of (*this)+x + */ + RAJA_INLINE + register_type operator/(register_type const &x) const + { + return load() / x; + } + + /*! + * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator/=(register_type const &x) + { + store(load() / x); + return *this; + } + + /*! + * @brief Sum the elements of this vector + * @return Sum of the values of the vectors scalar elements + */ + RAJA_INLINE + element_type sum() const + { + return load().sum(); + } + + /*! + * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(register_type const &x) const + { + return load().dot(x); + } + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + return load().max(); + } + + /*! + * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + register_type vmax(register_type a) const + { + return load().vmax(a); + } + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + return load().min(); + } + + /*! 
+ * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + register_type vmin(register_type a) const + { + return load().vmin(a); + } + + }; + +} // namespace RAJA + + +#endif diff --git a/include/RAJA/policy/simd/policy.hpp b/include/RAJA/policy/simd/policy.hpp index d10d4e9e0d..6ca059d7c4 100644 --- a/include/RAJA/policy/simd/policy.hpp +++ b/include/RAJA/policy/simd/policy.hpp @@ -117,6 +117,8 @@ class StreamRegisterIndex { }; + + struct simd_register{}; } // end of namespace simd diff --git a/include/RAJA/policy/simd/register/avx_double2.hpp b/include/RAJA/policy/simd/register/avx_double2.hpp index 9ed7b48a68..9b24d98b4d 100644 --- a/include/RAJA/policy/simd/register/avx_double2.hpp +++ b/include/RAJA/policy/simd/register/avx_double2.hpp @@ -64,6 +64,13 @@ namespace RAJA */ Register(self_type const &c) : m_value(c.m_value) {} + /*! + * @brief Construct from scalar. + * Sets all elements to same value (broadcast). + */ + RAJA_INLINE + Register(element_type const &c) : m_value(_mm_set1_pd(c)) {} + /*! * @brief Load operation, assuming scalars are in consecutive memory * locations. diff --git a/include/RAJA/policy/simd/register/avx_double3.hpp b/include/RAJA/policy/simd/register/avx_double3.hpp index e76124cfb4..37d7ea7230 100644 --- a/include/RAJA/policy/simd/register/avx_double3.hpp +++ b/include/RAJA/policy/simd/register/avx_double3.hpp @@ -74,6 +74,13 @@ namespace RAJA RAJA_INLINE Register(self_type const &c) : m_value(c.m_value) {} + /*! + * @brief Construct from scalar. + * Sets all elements to same value (broadcast). + */ + RAJA_INLINE + Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {} + /*! * @brief Load constructor, assuming scalars are in consecutive memory * locations. 
diff --git a/include/RAJA/policy/simd/register/avx_double4.hpp b/include/RAJA/policy/simd/register/avx_double4.hpp index 228db7b1e8..5216de2ad3 100644 --- a/include/RAJA/policy/simd/register/avx_double4.hpp +++ b/include/RAJA/policy/simd/register/avx_double4.hpp @@ -68,6 +68,13 @@ namespace RAJA Register(self_type const &c) : m_value(c.m_value) {} + /*! + * @brief Construct from scalar. + * Sets all elements to same value (broadcast). + */ + RAJA_INLINE + Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {} + /*! * @brief Load constructor, assuming scalars are in consecutive memory * locations. diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index c256ab07ac..81755875fc 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -457,7 +457,7 @@ TEST(StreamVectorTest, TestFixedForall) delete[] B; delete[] C; } -#endif + TEST(StreamVectorTest, TestStreamForall) { using TypeParam = RAJA::StreamVector, 8>; @@ -498,3 +498,45 @@ TEST(StreamVectorTest, TestStreamForall) delete[] B; delete[] C; } +#endif + +TEST(StreamVectorTest, TestStreamForallRef) +{ + using TypeParam = RAJA::StreamVector, 8>; + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + + + size_t N = 8000 + (100*drand48()); + + element_t *A = new element_t[N]; + element_t *B = new element_t[N]; + element_t *C = new element_t[N]; + for(size_t i = 0;i < N; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); + C[i] = 0.0; + } + + using policy_t = RAJA::simd_stream_exec; + + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](RAJA::StreamRegisterIndex i) + { + RAJA::VectorRef, double*, true> x(i, A); + RAJA::VectorRef, double*, true> y(i, B); + RAJA::VectorRef, double*, true> z(i, C); + z = (x*y)+3; + + }); + + + for(size_t i = 0;i < N;i ++){ + ASSERT_DOUBLE_EQ(A[i]*B[i]+3, C[i]); + } + + delete[] A; + delete[] B; + delete[] C; +} From 363f99f760807b269c936cddc71ab9bfc196426f Mon Sep 17 00:00:00 2001 
From: "Adam J. Kunen" Date: Thu, 7 Nov 2019 15:08:50 -0800 Subject: [PATCH 006/593] Added ability to have scalars on the left-hand side of operators with vectors --- include/RAJA/pattern/register.hpp | 25 ++++++++++++++++ include/RAJA/pattern/vector/FixedVector.hpp | 26 +++++++++++++++++ include/RAJA/pattern/vector/StreamVector.hpp | 25 ++++++++++++++++ include/RAJA/pattern/vector/VectorRef.hpp | 30 ++++++++++++++++++++ test/unit/test-vector.cpp | 5 ++-- 5 files changed, 109 insertions(+), 2 deletions(-) diff --git a/include/RAJA/pattern/register.hpp b/include/RAJA/pattern/register.hpp index 561b8c9bf2..b6567d1b46 100644 --- a/include/RAJA/pattern/register.hpp +++ b/include/RAJA/pattern/register.hpp @@ -37,6 +37,31 @@ namespace RAJA class Register; + + template + Register + operator+(ST x, Register const &y){ + return Register(x) + y; + } + + template + Register + operator-(ST x, Register const &y){ + return Register(x) - y; + } + + template + Register + operator*(ST x, Register const &y){ + return Register(x) * y; + } + + template + Register + operator/(ST x, Register const &y){ + return Register(x) / y; + } + } // namespace RAJA diff --git a/include/RAJA/pattern/vector/FixedVector.hpp b/include/RAJA/pattern/vector/FixedVector.hpp index 423936ca7c..ad1be3047e 100644 --- a/include/RAJA/pattern/vector/FixedVector.hpp +++ b/include/RAJA/pattern/vector/FixedVector.hpp @@ -478,6 +478,32 @@ namespace RAJA }; + + + template + FixedVector + operator+(ST x, FixedVector const &y){ + return FixedVector(x) + y; + } + + template + FixedVector + operator-(ST x, FixedVector const &y){ + return FixedVector(x) - y; + } + + template + FixedVector + operator*(ST x, FixedVector const &y){ + return FixedVector(x) * y; + } + + template + FixedVector + operator/(ST x, FixedVector const &y){ + return FixedVector(x) / y; + } + } // namespace RAJA diff --git a/include/RAJA/pattern/vector/StreamVector.hpp b/include/RAJA/pattern/vector/StreamVector.hpp index e9bb61d2d2..20a53d0401 100644 --- 
a/include/RAJA/pattern/vector/StreamVector.hpp +++ b/include/RAJA/pattern/vector/StreamVector.hpp @@ -460,6 +460,31 @@ namespace RAJA }; + + template + StreamVector + operator+(ST x, StreamVector const &y){ + return StreamVector(x) + y; + } + + template + StreamVector + operator-(ST x, StreamVector const &y){ + return StreamVector(x) - y; + } + + template + StreamVector + operator*(ST x, StreamVector const &y){ + return StreamVector(x) * y; + } + + template + StreamVector + operator/(ST x, StreamVector const &y){ + return StreamVector(x) / y; + } + } // namespace RAJA diff --git a/include/RAJA/pattern/vector/VectorRef.hpp b/include/RAJA/pattern/vector/VectorRef.hpp index b0101221e9..710ff104cc 100644 --- a/include/RAJA/pattern/vector/VectorRef.hpp +++ b/include/RAJA/pattern/vector/VectorRef.hpp @@ -335,6 +335,36 @@ namespace RAJA }; + + template + typename REGISTER_INDEX::register_type + operator+(ST x, VectorRef const &y){ + using register_type = typename REGISTER_INDEX::register_type; + return register_type(x) + y.load(); + } + + template + typename REGISTER_INDEX::register_type + operator-(ST x, VectorRef const &y){ + using register_type = typename REGISTER_INDEX::register_type; + return register_type(x) - y.load(); + } + + template + typename REGISTER_INDEX::register_type + operator*(ST x, VectorRef const &y){ + using register_type = typename REGISTER_INDEX::register_type; + return register_type(x) * y.load(); + } + + template + typename REGISTER_INDEX::register_type + operator/(ST x, VectorRef const &y){ + using register_type = typename REGISTER_INDEX::register_type; + return register_type(x) / y.load(); + } + + } // namespace RAJA diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index 81755875fc..3f5bdc1f90 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -527,13 +527,14 @@ TEST(StreamVectorTest, TestStreamForallRef) RAJA::VectorRef, double*, true> x(i, A); RAJA::VectorRef, double*, true> y(i, B); RAJA::VectorRef, 
double*, true> z(i, C); - z = (x*y)+3; + + z = 3+(x*(5/y))+9; }); for(size_t i = 0;i < N;i ++){ - ASSERT_DOUBLE_EQ(A[i]*B[i]+3, C[i]); + ASSERT_DOUBLE_EQ(3+(A[i]*(5/B[i]))+9, C[i]); } delete[] A; From fc9cdf321861255b49010dc8541987c8fa7d2168 Mon Sep 17 00:00:00 2001 From: "Adam J. Kunen" Date: Thu, 7 Nov 2019 16:04:04 -0800 Subject: [PATCH 007/593] Got View::operator[] working to accept StreamRegisterIndex<> types, and return a VectorRef object --- include/RAJA/RAJA.hpp | 5 +++++ include/RAJA/policy/simd/policy.hpp | 5 +++++ include/RAJA/util/View.hpp | 18 ++++++++++++++++++ test/unit/test-vector.cpp | 11 +++++------ 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index c6538aa117..c4fc33a5bb 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -44,6 +44,11 @@ #include "RAJA/pattern/forall.hpp" #include "RAJA/pattern/kernel.hpp" +// +// Generic templates to describe SIMD/SIMT registers and vectors +// +#include "RAJA/pattern/register.hpp" +#include "RAJA/pattern/vector.hpp" // // All platforms must support sequential execution. 
diff --git a/include/RAJA/policy/simd/policy.hpp b/include/RAJA/policy/simd/policy.hpp index 6ca059d7c4..e8904e7889 100644 --- a/include/RAJA/policy/simd/policy.hpp +++ b/include/RAJA/policy/simd/policy.hpp @@ -99,6 +99,11 @@ class StreamRegisterIndex { RAJA_INLINE explicit StreamRegisterIndex(index_type value, size_t length) : m_value(value), m_length(length) {} + RAJA_INLINE + void set(index_type x) { + m_value = x; + } + RAJA_INLINE constexpr index_type operator*() const { diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp index 374e4b562c..5ba6f59494 100644 --- a/include/RAJA/util/View.hpp +++ b/include/RAJA/util/View.hpp @@ -109,6 +109,24 @@ struct View { auto idx = stripIndexType(layout(args...)); return data[idx]; } + + // making this specifically typed would require unpacking the layout, + // this is easier to maintain + //RAJA::StreamRegisterIndex + //RAJA::VectorRef, double*, true> + template + RAJA_HOST_DEVICE RAJA_INLINE + VectorRef, pointer_type, true> + operator[](RAJA::StreamRegisterIndex arg) const + { + using idx_type = StreamRegisterIndex; + using ref_type = VectorRef; + + auto idx = stripIndexType(layout(*arg)); + + arg.set(Arg(idx)); + return ref_type(arg, data); + } }; template > X(A, N); + RAJA::View> Y(B, N); + RAJA::View> Z(C, N); + using policy_t = RAJA::simd_stream_exec; RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](RAJA::StreamRegisterIndex i) { - RAJA::VectorRef, double*, true> x(i, A); - RAJA::VectorRef, double*, true> y(i, B); - RAJA::VectorRef, double*, true> z(i, C); - - z = 3+(x*(5/y))+9; - + Z[i] = 3+(X[i]*(5/Y[i]))+9; }); From cfefb5a164d83c9c9047837d189e4faaae278e7b Mon Sep 17 00:00:00 2001 From: "Adam J. 
Kunen" Date: Fri, 8 Nov 2019 09:44:33 -0800 Subject: [PATCH 008/593] Cleaned up the VectorRef and VectorIndex stuff --- include/RAJA/pattern/vector/VectorRef.hpp | 133 +++++++++++----------- include/RAJA/policy/simd/forall.hpp | 50 ++++---- include/RAJA/policy/simd/policy.hpp | 61 +++------- include/RAJA/util/View.hpp | 18 +-- test/unit/test-vector.cpp | 8 +- 5 files changed, 119 insertions(+), 151 deletions(-) diff --git a/include/RAJA/pattern/vector/VectorRef.hpp b/include/RAJA/pattern/vector/VectorRef.hpp index 710ff104cc..6b4bb9155d 100644 --- a/include/RAJA/pattern/vector/VectorRef.hpp +++ b/include/RAJA/pattern/vector/VectorRef.hpp @@ -35,17 +35,23 @@ namespace RAJA * */ - template + template class VectorRef { public: - using self_type = VectorRef; - using register_index_type = REGISTER_INDEX; - using register_type = typename register_index_type::register_type; - using element_type = typename register_type::element_type; + using self_type = + VectorRef; + + using vector_type = VECTOR_TYPE; + using index_type = INDEX_TYPE; using pointer_type = POINTER_TYPE; + using element_type = typename vector_type::element_type; + + private: - register_index_type m_index; + index_type m_linear_index; + index_type m_length; pointer_type m_data; size_t m_stride; @@ -56,36 +62,29 @@ namespace RAJA * @brief Default constructor, zeros register contents */ RAJA_INLINE - VectorRef() : m_index(), m_data() {}; - - /*! - * @brief Stride-1 constructor - */ - RAJA_INLINE - VectorRef(register_index_type index, pointer_type pointer) : - m_index(index), - m_data(pointer), - m_stride(1) - {} - + VectorRef() : m_linear_index(0), m_length(0), m_data(), m_stride(0) {}; /*! 
- * @brief Strided constructor + * @brief Constructor */ RAJA_INLINE - VectorRef(register_index_type index, pointer_type pointer, size_t stride) : - m_index(index), + VectorRef(index_type lin_index, index_type length, pointer_type pointer, index_type stride) : + m_linear_index(lin_index), + m_length(length), m_data(pointer), m_stride(stride) {} + /*! * @brief Copy constructor */ RAJA_INLINE VectorRef(self_type const &c) : - m_index(c.m_index), - m_data(c.m_data) + m_linear_index(c.m_linear_index), + m_length(c.m_length), + m_data(c.m_data), + m_stride(c.m_stride) {} @@ -94,13 +93,13 @@ namespace RAJA * @param value Value to set all vector elements to */ RAJA_INLINE - void store(register_type value) const + void store(vector_type value) const { if(STRIDE_ONE){ - value.store(m_data+*m_index); + value.store(m_data+m_linear_index); } else{ - value.store(m_data+*m_index, m_stride); + value.store(m_data+m_linear_index, m_stride); } } @@ -109,20 +108,26 @@ namespace RAJA * @param value Value to set all vector elements to */ RAJA_INLINE - register_type load() const + vector_type load() const { - register_type value; + vector_type value; if(STRIDE_ONE){ - value.load(m_data+*m_index); + value.load(m_data+m_linear_index); } else{ - value.load(m_data+*m_index, m_stride); + value.load(m_data+m_linear_index, m_stride); } return value; } + /*! + * @brief Automatic conversion to the underlying vector_type. + * + * This allows the use of a VectorRef in an expression, and lets the + * compiler automatically convert a VectorRef into a load(). 
+ */ RAJA_INLINE - operator register_type() const { + operator vector_type() const { return load(); } @@ -132,7 +137,7 @@ namespace RAJA * @param value Value to set all vector elements to */ RAJA_INLINE - self_type const &operator=(register_type value) + self_type const &operator=(vector_type value) { store(value); return *this; @@ -145,7 +150,6 @@ namespace RAJA * @return Returns scalar value at i */ template - constexpr RAJA_INLINE element_type operator[](IDX i) const { @@ -162,7 +166,7 @@ namespace RAJA RAJA_INLINE void set(IDX i, element_type value) { - register_type x = load(); + vector_type x = load(); x[i] = value; store(x); } @@ -174,7 +178,7 @@ namespace RAJA RAJA_INLINE self_type const &operator=(element_type value) { - register_type x = value; + vector_type x = value; store(x); return *this; } @@ -186,7 +190,7 @@ namespace RAJA * @return Value of (*this)+x */ RAJA_INLINE - register_type operator+(register_type const &x) const + vector_type operator+(vector_type const &x) const { return load() + x; } @@ -197,7 +201,7 @@ namespace RAJA * @return Value of (*this)+x */ RAJA_INLINE - self_type const &operator+=(register_type const &x) + self_type const &operator+=(vector_type const &x) { store(load() + x); return *this; @@ -209,7 +213,7 @@ namespace RAJA * @return Value of (*this)+x */ RAJA_INLINE - register_type operator-(register_type const &x) const + vector_type operator-(vector_type const &x) const { return load() - x; } @@ -220,7 +224,7 @@ namespace RAJA * @return Value of (*this)+x */ RAJA_INLINE - self_type const &operator-=(register_type const &x) + self_type const &operator-=(vector_type const &x) { store(load() - x); return *this; @@ -232,7 +236,7 @@ namespace RAJA * @return Value of (*this)+x */ RAJA_INLINE - register_type operator*(register_type const &x) const + vector_type operator*(vector_type const &x) const { return load() * x; } @@ -243,7 +247,7 @@ namespace RAJA * @return Value of (*this)+x */ RAJA_INLINE - self_type const 
&operator*=(register_type const &x) + self_type const &operator*=(vector_type const &x) { store(load() * x); return *this; @@ -255,7 +259,7 @@ namespace RAJA * @return Value of (*this)+x */ RAJA_INLINE - register_type operator/(register_type const &x) const + vector_type operator/(vector_type const &x) const { return load() / x; } @@ -266,7 +270,7 @@ namespace RAJA * @return Value of (*this)+x */ RAJA_INLINE - self_type const &operator/=(register_type const &x) + self_type const &operator/=(vector_type const &x) { store(load() / x); return *this; @@ -288,7 +292,7 @@ namespace RAJA * @return Value of (*this) dot x */ RAJA_INLINE - element_type dot(register_type const &x) const + element_type dot(vector_type const &x) const { return load().dot(x); } @@ -308,7 +312,7 @@ namespace RAJA * @return Vector of the element-wise max values */ RAJA_INLINE - register_type vmax(register_type a) const + vector_type vmax(vector_type a) const { return load().vmax(a); } @@ -328,7 +332,7 @@ namespace RAJA * @return Vector of the element-wise max values */ RAJA_INLINE - register_type vmin(register_type a) const + vector_type vmin(vector_type a) const { return load().vmin(a); } @@ -336,32 +340,29 @@ namespace RAJA }; - template - typename REGISTER_INDEX::register_type - operator+(ST x, VectorRef const &y){ - using register_type = typename REGISTER_INDEX::register_type; - return register_type(x) + y.load(); + + template + VECTOR_TYPE + operator+(ST x, VectorRef const &y){ + return VECTOR_TYPE(x) + y.load(); } - template - typename REGISTER_INDEX::register_type - operator-(ST x, VectorRef const &y){ - using register_type = typename REGISTER_INDEX::register_type; - return register_type(x) - y.load(); + template + VECTOR_TYPE + operator-(ST x, VectorRef const &y){ + return VECTOR_TYPE(x) - y.load(); } - template - typename REGISTER_INDEX::register_type - operator*(ST x, VectorRef const &y){ - using register_type = typename REGISTER_INDEX::register_type; - return register_type(x) * 
y.load(); + template + VECTOR_TYPE + operator*(ST x, VectorRef const &y){ + return VECTOR_TYPE(x) * y.load(); } - template - typename REGISTER_INDEX::register_type - operator/(ST x, VectorRef const &y){ - using register_type = typename REGISTER_INDEX::register_type; - return register_type(x) / y.load(); + template + VECTOR_TYPE + operator/(ST x, VectorRef const &y){ + return VECTOR_TYPE(x) / y.load(); } diff --git a/include/RAJA/policy/simd/forall.hpp b/include/RAJA/policy/simd/forall.hpp index c66dba322a..3d876b4422 100644 --- a/include/RAJA/policy/simd/forall.hpp +++ b/include/RAJA/policy/simd/forall.hpp @@ -59,27 +59,27 @@ RAJA_INLINE void forall_impl(const simd_exec &, } } +// +//template +//RAJA_INLINE void forall_impl(const simd_vector_exec> &, +// Iterable &&iter, +// Func &&loop_body) +//{ +// auto begin = std::begin(iter); +// auto end = std::end(iter); +// auto distance = std::distance(begin, end); +// +// using index_type = camp::decay; +// using simd_index_type = FixedRegisterIndex; +// +// for (decltype(distance) i = 0; i < distance; i+=Value::s_num_elem) { +// loop_body(simd_index_type(*(begin + i))); +// } +//} +// -template -RAJA_INLINE void forall_impl(const simd_fixed_exec &, - Iterable &&iter, - Func &&loop_body) -{ - auto begin = std::begin(iter); - auto end = std::end(iter); - auto distance = std::distance(begin, end); - - using index_type = camp::decay; - using simd_index_type = FixedRegisterIndex; - - for (decltype(distance) i = 0; i < distance; i+=Value::s_num_elem) { - loop_body(simd_index_type(*(begin + i))); - } -} - - -template -RAJA_INLINE void forall_impl(const simd_stream_exec &, +template +RAJA_INLINE void forall_impl(const simd_vector_exec &, Iterable &&iter, Func &&loop_body) { @@ -87,15 +87,15 @@ RAJA_INLINE void forall_impl(const simd_stream_exec &, auto end = std::end(iter); auto distance = std::distance(begin, end); - auto distance_simd = distance - (distance%Value::s_num_elem); + auto distance_simd = distance - 
(distance%VectorType::s_num_elem); auto distance_remainder = distance - distance_simd; using index_type = camp::decay; - using simd_index_type = StreamRegisterIndex; + using simd_index_type = VectorIndex; - // Streaming SIMD loop for complete elements - for (decltype(distance) i = 0; i < distance_simd; i+=Value::s_num_elem) { - loop_body(simd_index_type(*(begin + i), Value::s_num_elem)); + // Streaming loop for complete vector widths + for (decltype(distance) i = 0; i < distance_simd; i+=VectorType::s_num_elem) { + loop_body(simd_index_type(*(begin + i), VectorType::s_num_elem)); } // Postamble for reamining elements diff --git a/include/RAJA/policy/simd/policy.hpp b/include/RAJA/policy/simd/policy.hpp index e8904e7889..3133b2121f 100644 --- a/include/RAJA/policy/simd/policy.hpp +++ b/include/RAJA/policy/simd/policy.hpp @@ -44,70 +44,37 @@ struct simd_exec : make_policy_pattern_launch_platform_t { }; -template -struct simd_fixed_exec : make_policy_pattern_launch_platform_t +struct simd_vector_exec : make_policy_pattern_launch_platform_t { - using value_type = VALUE_TYPE; + using vector_type = VECTOR_TYPE; }; -template -struct simd_stream_exec : make_policy_pattern_launch_platform_t { - using value_type = VALUE_TYPE; -}; -template -class FixedRegisterIndex { +template +class VectorIndex { public: using index_type = IDX; - using register_type = REGISTER; - - RAJA_INLINE - FixedRegisterIndex() : m_value(0) {} - - RAJA_INLINE - explicit FixedRegisterIndex(index_type value) : m_value(value) {} + using vector_type = VECTOR_TYPE; RAJA_INLINE constexpr - index_type operator*() const { - return m_value; - } - - private: - index_type m_value; -}; - - -template -class StreamRegisterIndex { - public: - using index_type = IDX; - using register_type = REGISTER; + VectorIndex() : m_index(0), m_length(vector_type::s_num_elem) {} RAJA_INLINE - StreamRegisterIndex() : m_value(0), m_length(REGISTER::s_num_elem) {} - - RAJA_INLINE - explicit StreamRegisterIndex(index_type value, 
size_t length) : m_value(value), m_length(length) {} - - RAJA_INLINE - void set(index_type x) { - m_value = x; - } + constexpr + VectorIndex(index_type value, size_t length) : m_index(value), m_length(length) {} RAJA_INLINE constexpr index_type operator*() const { - return m_value; + return m_index; } RAJA_INLINE @@ -117,7 +84,7 @@ class StreamRegisterIndex { } private: - index_type m_value; + index_type m_index; size_t m_length; }; @@ -131,10 +98,8 @@ struct simd_register{}; } // end of namespace policy using policy::simd::simd_exec; -using policy::simd::simd_fixed_exec; -using policy::simd::simd_stream_exec; -using policy::simd::FixedRegisterIndex; -using policy::simd::StreamRegisterIndex; +using policy::simd::simd_vector_exec; +using policy::simd::VectorIndex; using policy::simd::simd_register; } // end of namespace RAJA diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp index 5ba6f59494..0128daa33d 100644 --- a/include/RAJA/util/View.hpp +++ b/include/RAJA/util/View.hpp @@ -52,6 +52,7 @@ struct View { using value_type = ValueType; using pointer_type = PointerType; using layout_type = LayoutType; + using linear_index_type = typename layout_type::IndexLinear; using nc_value_type = typename std::remove_const::type; using nc_pointer_type = typename std::add_pointer::type>::type>::type; @@ -60,6 +61,8 @@ struct View { layout_type const layout; pointer_type data; + + template RAJA_INLINE constexpr View(pointer_type data_ptr, Args... 
dim_sizes) : layout(dim_sizes...), data(data_ptr) @@ -116,16 +119,15 @@ struct View { //RAJA::VectorRef, double*, true> template RAJA_HOST_DEVICE RAJA_INLINE - VectorRef, pointer_type, true> - operator[](RAJA::StreamRegisterIndex arg) const + VectorRef + operator[](RAJA::VectorIndex arg) const { - using idx_type = StreamRegisterIndex; - using ref_type = VectorRef; - - auto idx = stripIndexType(layout(*arg)); + // Compute the linear index + linear_index_type idx = stripIndexType(layout(*arg)); - arg.set(Arg(idx)); - return ref_type(arg, data); + // Stuff it back into the index + using ref_type = VectorRef; + return ref_type(idx, REGISTER::s_num_elem, data, 1); } }; diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index d65a5868db..cc32b74b95 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -503,9 +503,9 @@ TEST(StreamVectorTest, TestStreamForall) TEST(StreamVectorTest, TestStreamForallRef) { using TypeParam = RAJA::StreamVector, 8>; - using register_t = TypeParam; + using vector_t = TypeParam; - using element_t = typename register_t::element_type; + using element_t = typename vector_t::element_type; size_t N = 8000 + (100*drand48()); @@ -523,10 +523,10 @@ TEST(StreamVectorTest, TestStreamForallRef) RAJA::View> Y(B, N); RAJA::View> Z(C, N); - using policy_t = RAJA::simd_stream_exec; + using policy_t = RAJA::simd_vector_exec; RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](RAJA::StreamRegisterIndex i) + [=](RAJA::VectorIndex i) { Z[i] = 3+(X[i]*(5/Y[i]))+9; }); From 5e82a8c2204f8ad758be2733fd119afcfdecc162 Mon Sep 17 00:00:00 2001 From: "Adam J. 
Kunen" Date: Fri, 8 Nov 2019 14:19:12 -0800 Subject: [PATCH 009/593] Some cleanup, split unit test files into register and vector tests --- include/RAJA/policy/simd/forall.hpp | 19 +- include/RAJA/policy/simd/policy.hpp | 3 - include/RAJA/policy/simd/register.hpp | 25 +- include/RAJA/policy/simd/register/avx.hpp | 30 ++ .../RAJA/policy/simd/register/avx_double2.hpp | 4 +- .../RAJA/policy/simd/register/avx_double3.hpp | 10 +- .../RAJA/policy/simd/register/avx_double4.hpp | 4 +- test/unit/CMakeLists.txt | 4 + test/unit/test-register.cpp | 329 ++++++++++++ test/unit/test-vector.cpp | 487 +----------------- 10 files changed, 412 insertions(+), 503 deletions(-) create mode 100644 include/RAJA/policy/simd/register/avx.hpp create mode 100644 test/unit/test-register.cpp diff --git a/include/RAJA/policy/simd/forall.hpp b/include/RAJA/policy/simd/forall.hpp index 3d876b4422..9d02bec2fc 100644 --- a/include/RAJA/policy/simd/forall.hpp +++ b/include/RAJA/policy/simd/forall.hpp @@ -59,24 +59,7 @@ RAJA_INLINE void forall_impl(const simd_exec &, } } -// -//template -//RAJA_INLINE void forall_impl(const simd_vector_exec> &, -// Iterable &&iter, -// Func &&loop_body) -//{ -// auto begin = std::begin(iter); -// auto end = std::end(iter); -// auto distance = std::distance(begin, end); -// -// using index_type = camp::decay; -// using simd_index_type = FixedRegisterIndex; -// -// for (decltype(distance) i = 0; i < distance; i+=Value::s_num_elem) { -// loop_body(simd_index_type(*(begin + i))); -// } -//} -// + template RAJA_INLINE void forall_impl(const simd_vector_exec &, diff --git a/include/RAJA/policy/simd/policy.hpp b/include/RAJA/policy/simd/policy.hpp index 3133b2121f..924e8107ec 100644 --- a/include/RAJA/policy/simd/policy.hpp +++ b/include/RAJA/policy/simd/policy.hpp @@ -91,8 +91,6 @@ class VectorIndex { -struct simd_register{}; - } // end of namespace simd } // end of namespace policy @@ -100,7 +98,6 @@ struct simd_register{}; using policy::simd::simd_exec; using 
policy::simd::simd_vector_exec; using policy::simd::VectorIndex; -using policy::simd::simd_register; } // end of namespace RAJA diff --git a/include/RAJA/policy/simd/register.hpp b/include/RAJA/policy/simd/register.hpp index ae9b6ad98f..7ce43a3f67 100644 --- a/include/RAJA/policy/simd/register.hpp +++ b/include/RAJA/policy/simd/register.hpp @@ -21,8 +21,27 @@ #include #include -#include -#include -#include +#ifdef __AVX__ +#include +#endif + +namespace RAJA +{ +namespace policy +{ + namespace simd + { + + // This sets the default SIMD register that will be used + // Individual registers can + using simd_register = simd_avx_register; + } +} + + + + using policy::simd::simd_register; + +} #endif diff --git a/include/RAJA/policy/simd/register/avx.hpp b/include/RAJA/policy/simd/register/avx.hpp new file mode 100644 index 0000000000..715fd3f622 --- /dev/null +++ b/include/RAJA/policy/simd/register/avx.hpp @@ -0,0 +1,30 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA simd policy definitions. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_simd_register_avx_HPP +#define RAJA_policy_simd_register_avx_HPP + +namespace RAJA { + struct simd_avx_register {}; +} + + +#endif + +#include +#include +#include diff --git a/include/RAJA/policy/simd/register/avx_double2.hpp b/include/RAJA/policy/simd/register/avx_double2.hpp index 9b24d98b4d..5ada2cb5d7 100644 --- a/include/RAJA/policy/simd/register/avx_double2.hpp +++ b/include/RAJA/policy/simd/register/avx_double2.hpp @@ -31,9 +31,9 @@ namespace RAJA template<> - class Register{ + class Register{ public: - using self_type = Register; + using self_type = Register; using element_type = double; static constexpr size_t s_num_elem = 2; diff --git a/include/RAJA/policy/simd/register/avx_double3.hpp b/include/RAJA/policy/simd/register/avx_double3.hpp index 37d7ea7230..cd439105e2 100644 --- a/include/RAJA/policy/simd/register/avx_double3.hpp +++ b/include/RAJA/policy/simd/register/avx_double3.hpp @@ -31,9 +31,9 @@ namespace RAJA template<> - class Register{ + class Register{ public: - using self_type = Register; + using self_type = Register; using element_type = double; static constexpr size_t s_num_elem = 3; @@ -50,7 +50,7 @@ namespace RAJA // Mask used to mask off the upper double from the vector using mask_type = __m256i; - static constexpr mask_type s_mask = (__m256i)(__v4di){ -1, -1, -1, 0}; + //static constexpr mask_type s_mask = (__m256i)(__v4di){ -1, -1, -1, 0}; public: @@ -87,7 +87,7 @@ namespace RAJA */ RAJA_INLINE void load(element_type const *ptr){ - m_value = _mm256_maskload_pd(ptr, s_mask); + m_value = _mm256_maskload_pd(ptr, (__m256i)(__v4di){ -1, -1, -1, 0}); } /*! @@ -113,7 +113,7 @@ namespace RAJA */ RAJA_INLINE void store(element_type *ptr) const{ - _mm256_maskstore_pd(ptr, m_value, s_mask); + _mm256_maskstore_pd(ptr, (__m256i)(__v4di){ -1, -1, -1, 0}, m_value); } /*! 
diff --git a/include/RAJA/policy/simd/register/avx_double4.hpp b/include/RAJA/policy/simd/register/avx_double4.hpp index 5216de2ad3..3c9108eaee 100644 --- a/include/RAJA/policy/simd/register/avx_double4.hpp +++ b/include/RAJA/policy/simd/register/avx_double4.hpp @@ -31,9 +31,9 @@ namespace RAJA template<> - class Register{ + class Register{ public: - using self_type = Register; + using self_type = Register; using element_type = double; static constexpr size_t s_num_elem = 4; diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 9878c514ab..b4e7e9a525 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -73,6 +73,10 @@ raja_add_test( NAME test-kernel-lambda-args SOURCES test-kernel-lambda-args.cpp) +raja_add_test( + NAME test-register + SOURCES test-register.cpp) + raja_add_test( NAME test-vector SOURCES test-vector.cpp) diff --git a/test/unit/test-register.cpp b/test/unit/test-register.cpp new file mode 100644 index 0000000000..1086d57a5c --- /dev/null +++ b/test/unit/test-register.cpp @@ -0,0 +1,329 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for basic simd/simt vector operations +/// + +#include "RAJA/RAJA.hpp" +#include "gtest/gtest.h" + +#include "RAJA/pattern/register.hpp" + + +using RegisterTestTypes = ::testing::Types< + RAJA::Register, + RAJA::Register, + RAJA::Register, + RAJA::Register, + RAJA::Register, + RAJA::Register, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::StreamVector, 4>, + RAJA::StreamVector, 8>, + RAJA::StreamVector, 12>, + RAJA::StreamVector, 16>>; + + +template +class RegisterTest : public ::testing::Test +{ +protected: + + RegisterTest() = default; + virtual ~RegisterTest() = default; + + virtual void SetUp() + { + } + + virtual void TearDown() + { + } +}; +TYPED_TEST_CASE_P(RegisterTest); + + +/* + * We are using drand48() for input values so the compiler cannot do fancy + * things, like constexpr out all of the intrinsics. 
+ */ + +TYPED_TEST_P(RegisterTest, SimdRegisterSetGet) +{ + + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem]; + register_t x; + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + x.set(i, A[i]); + } + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(x[i], A[i]); + } + +} + + +TYPED_TEST_P(RegisterTest, SimdRegisterLoad) +{ + + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem*2]; + for(size_t i = 0;i < num_elem*2; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + } + + + // load stride-1 from pointer + register_t x; + x.load(A); + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(x[i], A[i]); + } + + // load stride-2from pointer + register_t y; + y.load(A, 2); + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(y[i], A[i*2]); + } +} + + + +TYPED_TEST_P(RegisterTest, SimdRegisterAdd) +{ + + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem], B[num_elem]; + register_t x, y; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); + x.set(i, A[i]); + y.set(i, B[i]); + } + + register_t z = x+y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z[i], A[i] + B[i]); + } + + register_t z2 = x; + z2 += y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z2[i], A[i] + B[i]); + } + +} + +TYPED_TEST_P(RegisterTest, SimdRegisterSubtract) +{ + + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem], B[num_elem]; + register_t x, y; + + for(size_t i = 0;i 
< num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); + x.set(i, A[i]); + y.set(i, B[i]); + } + + register_t z = x-y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z[i], A[i] - B[i]); + } + + register_t z2 = x; + z2 -= y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z2[i], A[i] - B[i]); + } +} + +TYPED_TEST_P(RegisterTest, SimdRegisterMultiply) +{ + + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem], B[num_elem]; + register_t x, y; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); + x.set(i, A[i]); + y.set(i, B[i]); + } + + register_t z = x*y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z[i], A[i] * B[i]); + } + + register_t z2 = x; + z2 *= y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z2[i], A[i] * B[i]); + } +} + +TYPED_TEST_P(RegisterTest, SimdRegisterDivide) +{ + + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem], B[num_elem]; + register_t x, y; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0)+1.0; + x.set(i, A[i]); + y.set(i, B[i]); + } + + register_t z = x/y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z[i], A[i] / B[i]); + } + + register_t z2 = x; + z2 /= y; + + for(size_t i = 0;i < num_elem; ++ i){ + ASSERT_DOUBLE_EQ(z2[i], A[i] / B[i]); + } +} + +TYPED_TEST_P(RegisterTest, SimdRegisterDotProduct) +{ + + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem], B[num_elem]; + register_t x, y; + + element_t expected = 0.0; + 
for(size_t i = 0;i < num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); + x.set(i, A[i]); + y.set(i, B[i]); + expected += A[i]*B[i]; + } + + ASSERT_DOUBLE_EQ(x.dot(y), expected); + +} + +TYPED_TEST_P(RegisterTest, SimdRegisterMax) +{ + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem]; + register_t x; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + x.set(i, A[i]); + } + + element_t expected = A[0]; + for(size_t i = 1;i < num_elem;++ i){ + expected = expected > A[i] ? expected : A[i]; + } + + ASSERT_DOUBLE_EQ(x.max(), expected); + +} + +TYPED_TEST_P(RegisterTest, SimdRegisterMin) +{ + using register_t = TypeParam; + + using element_t = typename register_t::element_type; + static constexpr size_t num_elem = register_t::s_num_elem; + + element_t A[num_elem]; + register_t x; + + for(size_t i = 0;i < num_elem; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + x.set(i, A[i]); + } + + element_t expected = A[0]; + for(size_t i = 1;i < num_elem;++ i){ + expected = expected < A[i] ? 
expected : A[i]; + } + + ASSERT_DOUBLE_EQ(x.min(), expected); + +} + + +REGISTER_TYPED_TEST_CASE_P(RegisterTest, SimdRegisterSetGet, + SimdRegisterLoad, + SimdRegisterAdd, + SimdRegisterSubtract, + SimdRegisterMultiply, + SimdRegisterDivide, + SimdRegisterDotProduct, + SimdRegisterMax, + SimdRegisterMin); + +INSTANTIATE_TYPED_TEST_CASE_P(SIMD, RegisterTest, RegisterTestTypes); + + + diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index cc32b74b95..9fffe895b5 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -15,33 +15,21 @@ #include "RAJA/pattern/register.hpp" #include "RAJA/pattern/vector.hpp" -#if 0 -using RegisterTestTypes = ::testing::Types< - RAJA::Register, - RAJA::Register, - RAJA::Register, - RAJA::Register, - RAJA::Register, - RAJA::Register, - RAJA::FixedVector, 27>, - RAJA::FixedVector, 27>, - RAJA::FixedVector, 27>, - RAJA::FixedVector, 27>, - RAJA::StreamVector, 4>, - RAJA::StreamVector, 8>, - RAJA::StreamVector, 12>, - RAJA::StreamVector, 16>>; +using VectorTestTypes = ::testing::Types< + RAJA::FixedVector, 4>, + RAJA::FixedVector, 8>, + RAJA::StreamVector, 4>, + RAJA::StreamVector, 8>>; -//usingRegister TestTypes = ::testing::Types, 27>>; -template -class RegisterTest : public ::testing::Test +template +class VectorTest : public ::testing::Test { protected: - RegisterTest() = default; - virtual ~RegisterTest() = default; + VectorTest() = default; + virtual ~VectorTest() = default; virtual void SetUp() { @@ -51,464 +39,18 @@ class RegisterTest : public ::testing::Test { } }; -TYPED_TEST_CASE_P(RegisterTest); +TYPED_TEST_CASE_P(VectorTest); -/* - * We are using drand48() for input values so the compiler cannot do fancy - * things, like constexpr out all of the intrinsics. 
- */ -TYPED_TEST_P(RegisterTest, SimdRegisterSetGet) +TYPED_TEST_P(VectorTest, ForallVectorRef1d) { - - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - element_t A[num_elem]; - register_t x; - for(size_t i = 0;i < num_elem; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - x.set(i, A[i]); - } - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(x[i], A[i]); - } - -} - - -TYPED_TEST_P(RegisterTest, SimdRegisterLoad) -{ - - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - element_t A[num_elem*2]; - for(size_t i = 0;i < num_elem*2; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - } - - - // load stride-1 from pointer - register_t x; - x.load(A); - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(x[i], A[i]); - } - - // load stride-2from pointer - register_t y; - y.load(A, 2); - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(y[i], A[i*2]); - } -} - - - -TYPED_TEST_P(RegisterTest, SimdRegisterAdd) -{ - - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - element_t A[num_elem], B[num_elem]; - register_t x, y; - - for(size_t i = 0;i < num_elem; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - B[i] = (element_t)(drand48()*1000.0); - x.set(i, A[i]); - y.set(i, B[i]); - } - - register_t z = x+y; - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(z[i], A[i] + B[i]); - } - - register_t z2 = x; - z2 += y; - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(z2[i], A[i] + B[i]); - } - -} - -TYPED_TEST_P(RegisterTest, SimdRegisterSubtract) -{ - - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - element_t A[num_elem], B[num_elem]; 
- register_t x, y; - - for(size_t i = 0;i < num_elem; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - B[i] = (element_t)(drand48()*1000.0); - x.set(i, A[i]); - y.set(i, B[i]); - } - - register_t z = x-y; - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(z[i], A[i] - B[i]); - } - - register_t z2 = x; - z2 -= y; - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(z2[i], A[i] - B[i]); - } -} - -TYPED_TEST_P(RegisterTest, SimdRegisterMultiply) -{ - - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - element_t A[num_elem], B[num_elem]; - register_t x, y; - - for(size_t i = 0;i < num_elem; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - B[i] = (element_t)(drand48()*1000.0); - x.set(i, A[i]); - y.set(i, B[i]); - } - - register_t z = x*y; - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(z[i], A[i] * B[i]); - } - - register_t z2 = x; - z2 *= y; - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(z2[i], A[i] * B[i]); - } -} - -TYPED_TEST_P(RegisterTest, SimdRegisterDivide) -{ - - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - element_t A[num_elem], B[num_elem]; - register_t x, y; - - for(size_t i = 0;i < num_elem; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - B[i] = (element_t)(drand48()*1000.0)+1.0; - x.set(i, A[i]); - y.set(i, B[i]); - } - - register_t z = x/y; - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(z[i], A[i] / B[i]); - } - - register_t z2 = x; - z2 /= y; - - for(size_t i = 0;i < num_elem; ++ i){ - ASSERT_DOUBLE_EQ(z2[i], A[i] / B[i]); - } -} - -TYPED_TEST_P(RegisterTest, SimdRegisterDotProduct) -{ - - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - element_t A[num_elem], B[num_elem]; - register_t x, 
y; - - element_t expected = 0.0; - for(size_t i = 0;i < num_elem; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - B[i] = (element_t)(drand48()*1000.0); - x.set(i, A[i]); - y.set(i, B[i]); - expected += A[i]*B[i]; - } - - ASSERT_DOUBLE_EQ(x.dot(y), expected); - -} - -TYPED_TEST_P(RegisterTest, SimdRegisterMax) -{ - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - element_t A[num_elem]; - register_t x; - - for(size_t i = 0;i < num_elem; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - x.set(i, A[i]); - } - - element_t expected = A[0]; - for(size_t i = 1;i < num_elem;++ i){ - expected = expected > A[i] ? expected : A[i]; - } - - ASSERT_DOUBLE_EQ(x.max(), expected); - -} - -TYPED_TEST_P(RegisterTest, SimdRegisterMin) -{ - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - element_t A[num_elem]; - register_t x; - - for(size_t i = 0;i < num_elem; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - x.set(i, A[i]); - } - - element_t expected = A[0]; - for(size_t i = 1;i < num_elem;++ i){ - expected = expected < A[i] ? 
expected : A[i]; - } - - ASSERT_DOUBLE_EQ(x.min(), expected); - -} - - -REGISTER_TYPED_TEST_CASE_P(RegisterTest, SimdRegisterSetGet, - SimdRegisterLoad, - SimdRegisterAdd, - SimdRegisterSubtract, - SimdRegisterMultiply, - SimdRegisterDivide, - SimdRegisterDotProduct, - SimdRegisterMax, - SimdRegisterMin); - -INSTANTIATE_TYPED_TEST_CASE_P(SIMD, RegisterTest, RegisterTestTypes); - - - - -TEST(StreamVectorTest, Test1) -{ - using TypeParam = RAJA::StreamVector, 8>; - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - element_t A[num_elem], B[num_elem]; - for(size_t i = 0;i < num_elem; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - } - - - for(size_t N = 1;N <= 8;++ N){ - for(size_t i = 0;i < 8;++ i){ - B[i] = 0; - } - - register_t x, y; - x.load_n(A, N); - y = 3.0; - x = x+y; - x.store(B); - - for(size_t i = 0;i < 8;++ i){ - if(i < N){ - ASSERT_DOUBLE_EQ(B[i], A[i]+3.0); - } - else - { - ASSERT_DOUBLE_EQ(B[i], 0.0); - } - } - } -} - -TEST(StreamVectorTest, TestStreamLoop) -{ - using TypeParam = RAJA::StreamVector, 32>; - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - // Use drand48 to change sizes of everything: this ensures that the compiler - // cannot optimize out sizes (and do more optimization than we want) - size_t N = 8000 + (100*drand48()); - - element_t *A = new element_t[N]; - element_t *B = new element_t[N]; - element_t *C = new element_t[N]; - for(size_t i = 0;i < N; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - B[i] = (element_t)(drand48()*1000.0); - } - - size_t Nsimd = N - (N%num_elem); - size_t Nrem = N - Nsimd; - for(size_t i = 0;i < Nsimd;i += num_elem){ - register_t x,y; - x.load_n(&A[i],num_elem); - y.load_n(&B[i],num_elem); - - register_t z = x*y; - z.store(&C[i]); - } - if(Nrem > 0){ - register_t x,y; - x.load_n(&A[Nsimd], Nrem); - 
y.load_n(&B[Nsimd], Nrem); - - register_t z = x*y; - z.store(&C[Nsimd]); - } - - for(size_t i = 0;i < N;i ++){ - ASSERT_DOUBLE_EQ(A[i]*B[i], C[i]); - } - - delete[] A; - delete[] B; - delete[] C; -} - -TEST(StreamVectorTest, TestFixedForall) -{ - using TypeParam = RAJA::FixedVector, 8>; - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - static constexpr size_t num_elem = register_t::s_num_elem; - - - size_t N = 1024*num_elem; - - element_t *A = new element_t[N]; - element_t *B = new element_t[N]; - element_t *C = new element_t[N]; - for(size_t i = 0;i < N; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - B[i] = (element_t)(drand48()*1000.0); - } - - using policy_t = RAJA::simd_fixed_exec; - - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](RAJA::FixedRegisterIndex i) - { - register_t x,y; - x.load(&A[*i]); - y.load(&B[*i]); - - register_t z = x*y; - z.store(&C[*i]); - }); - - - for(size_t i = 0;i < N;i ++){ - ASSERT_DOUBLE_EQ(A[i]*B[i], C[i]); - } - - delete[] A; - delete[] B; - delete[] C; -} - -TEST(StreamVectorTest, TestStreamForall) -{ - using TypeParam = RAJA::StreamVector, 8>; - using register_t = TypeParam; - - using element_t = typename register_t::element_type; - - - size_t N = 8000 + (100*drand48()); - - element_t *A = new element_t[N]; - element_t *B = new element_t[N]; - element_t *C = new element_t[N]; - for(size_t i = 0;i < N; ++ i){ - A[i] = (element_t)(drand48()*1000.0); - B[i] = (element_t)(drand48()*1000.0); - } - - using policy_t = RAJA::simd_stream_exec; - - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](RAJA::StreamRegisterIndex i) - { - register_t x,y; - x.load_n(&A[*i], i.size()); - y.load_n(&B[*i], i.size()); - - register_t z = x*y; - z.store(&C[*i]); - }); - - - for(size_t i = 0;i < N;i ++){ - ASSERT_DOUBLE_EQ(A[i]*B[i], C[i]); - } - - delete[] A; - delete[] B; - delete[] C; -} -#endif - -TEST(StreamVectorTest, TestStreamForallRef) -{ - using TypeParam = RAJA::StreamVector, 8>; using vector_t = 
TypeParam; using element_t = typename vector_t::element_type; - size_t N = 8000 + (100*drand48()); + size_t N = 8000;// + (100*drand48()); element_t *A = new element_t[N]; element_t *B = new element_t[N]; @@ -540,3 +82,8 @@ TEST(StreamVectorTest, TestStreamForallRef) delete[] B; delete[] C; } + + +REGISTER_TYPED_TEST_CASE_P(VectorTest, ForallVectorRef1d); + +INSTANTIATE_TYPED_TEST_CASE_P(SIMD, VectorTest, VectorTestTypes); From b3cfba37b8ff7ec6ef30428e888661c6cdd2bc1f Mon Sep 17 00:00:00 2001 From: "Adam J. Kunen" Date: Fri, 8 Nov 2019 14:50:06 -0800 Subject: [PATCH 010/593] Fixed views to work with stream/fixed vectors --- include/RAJA/pattern/vector/FixedVector.hpp | 52 ++++++----- include/RAJA/pattern/vector/StreamVector.hpp | 68 +++++++------- include/RAJA/pattern/vector/VectorRef.hpp | 93 +++++++++++++++++++- include/RAJA/util/View.hpp | 2 +- test/unit/test-vector.cpp | 7 +- 5 files changed, 164 insertions(+), 58 deletions(-) diff --git a/include/RAJA/pattern/vector/FixedVector.hpp b/include/RAJA/pattern/vector/FixedVector.hpp index ad1be3047e..2761ab3800 100644 --- a/include/RAJA/pattern/vector/FixedVector.hpp +++ b/include/RAJA/pattern/vector/FixedVector.hpp @@ -49,6 +49,7 @@ namespace RAJA using self_type = FixedVector; using element_type = ELEMENT_TYPE; + static constexpr size_t s_is_fixed = true; static constexpr size_t s_num_elem = NUM_ELEM; static constexpr size_t s_byte_width = sizeof(element_type); @@ -106,15 +107,15 @@ namespace RAJA * @brief Load constructor, assuming scalars are in consecutive memory * locations. 
*/ - RAJA_INLINE - void load(element_type const *ptr){ - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i].load(ptr + i*s_num_register_elem); - } - if(s_num_partial_registers){ - m_partial_register[0].load(ptr + s_num_full_elem); - } - } +// RAJA_INLINE +// void load(element_type const *ptr){ +// for(size_t i = 0;i < s_num_full_registers;++ i){ +// m_full_registers[i].load(ptr + i*s_num_register_elem); +// } +// if(s_num_partial_registers){ +// m_partial_register[0].load(ptr + s_num_full_elem); +// } +// } /*! * @brief Strided load constructor, when scalars are located in memory @@ -125,7 +126,7 @@ namespace RAJA * available. (like in avx2, but not in avx) */ RAJA_INLINE - void load(element_type const *ptr, size_t stride){ + void load(element_type const *ptr, size_t stride = 1){ for(size_t i = 0;i < s_num_full_registers;++ i){ m_full_registers[i].load(ptr + i*stride*s_num_register_elem, stride); } @@ -134,21 +135,32 @@ namespace RAJA } } - /*! - * @brief Store operation, assuming scalars are in consecutive memory + * @brief Load constructor, assuming scalars are in consecutive memory * locations. + * + * Since this is a Fixed length vector, the length arguments is ignored */ RAJA_INLINE - void store(element_type *ptr) const{ - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i].store(ptr + i*s_num_register_elem); - } - if(s_num_partial_registers){ - m_partial_register[0].store(ptr + s_num_full_elem); - } + void load_n(element_type const *ptr, size_t , size_t stride = 1){ + load(ptr, stride); } + + /*! + * @brief Store operation, assuming scalars are in consecutive memory + * locations. + */ +// RAJA_INLINE +// void store(element_type *ptr) const{ +// for(size_t i = 0;i < s_num_full_registers;++ i){ +// m_full_registers[i].store(ptr + i*s_num_register_elem); +// } +// if(s_num_partial_registers){ +// m_partial_register[0].store(ptr + s_num_full_elem); +// } +// } + /*! 
* @brief Strided store operation, where scalars are stored in memory * locations ptr, ptr+stride, ptr+2*stride, etc. @@ -158,7 +170,7 @@ namespace RAJA * available. */ RAJA_INLINE - void store(element_type *ptr, size_t stride) const{ + void store(element_type *ptr, size_t stride = 1) const{ for(size_t i = 0;i < s_num_full_registers;++ i){ m_full_registers[i].store(ptr + i*stride*s_num_register_elem, stride); } diff --git a/include/RAJA/pattern/vector/StreamVector.hpp b/include/RAJA/pattern/vector/StreamVector.hpp index 20a53d0401..c643aa764b 100644 --- a/include/RAJA/pattern/vector/StreamVector.hpp +++ b/include/RAJA/pattern/vector/StreamVector.hpp @@ -46,6 +46,8 @@ namespace RAJA using self_type = StreamVector; using element_type = ELEMENT_TYPE; + static constexpr size_t s_is_fixed = false; + static constexpr size_t s_num_elem = MAX_ELEM; static constexpr size_t s_num_registers = s_num_elem / s_num_register_elem; @@ -128,13 +130,13 @@ namespace RAJA * @brief Load constructor, assuming scalars are in consecutive memory * locations. */ - RAJA_INLINE - void load(element_type const *ptr){ - m_length = s_num_elem; - for(size_t i = 0;i < s_num_registers;++ i){ - m_registers[i].load(ptr + i*s_num_register_elem); - } - } +// RAJA_INLINE +// void load(element_type const *ptr){ +// m_length = s_num_elem; +// for(size_t i = 0;i < s_num_registers;++ i){ +// m_registers[i].load(ptr + i*s_num_register_elem); +// } +// } /*! * @brief Strided load constructor, when scalars are located in memory @@ -145,7 +147,7 @@ namespace RAJA * available. (like in avx2, but not in avx) */ RAJA_INLINE - void load(element_type const *ptr, size_t stride){ + void load(element_type const *ptr, size_t stride = 1){ m_length = s_num_elem; for(size_t i = 0;i < s_num_registers;++ i){ m_registers[i].load(ptr + i*s_num_register_elem*stride, stride); @@ -157,17 +159,17 @@ namespace RAJA * @brief Load constructor, assuming scalars are in consecutive memory * locations. 
*/ - void load_n(element_type const *ptr, size_t len){ - if(len == s_num_elem){ - load(ptr); - } - else{ - m_length = len; - for(size_t i = 0;i < len;++ i){ - set(i, ptr[i]); - } - } - } +// void load_n(element_type const *ptr, size_t len){ +// if(len == s_num_elem){ +// load(ptr); +// } +// else{ +// m_length = len; +// for(size_t i = 0;i < len;++ i){ +// set(i, ptr[i]); +// } +// } +// } /*! * @brief Strided load constructor, when scalars are located in memory @@ -177,7 +179,7 @@ namespace RAJA * Note: this could be done with "gather" instructions if they are * available. (like in avx2, but not in avx) */ - void load_n(element_type const *ptr, size_t len, size_t stride){ + void load_n(element_type const *ptr, size_t len, size_t stride = 1){ if(len == s_num_elem){ load(ptr, stride); } @@ -194,18 +196,18 @@ namespace RAJA * @brief Store operation, assuming scalars are in consecutive memory * locations. */ - void store(element_type *ptr) const{ - if(m_length == s_num_elem){ - for(size_t i = 0;i < s_num_registers;++ i){ - m_registers[i].store(ptr + i*s_num_register_elem); - } - } - else{ - for(size_t i = 0;i < m_length;++ i){ - ptr[i] = (*this)[i]; - } - } - } +// void store(element_type *ptr) const{ +// if(m_length == s_num_elem){ +// for(size_t i = 0;i < s_num_registers;++ i){ +// m_registers[i].store(ptr + i*s_num_register_elem); +// } +// } +// else{ +// for(size_t i = 0;i < m_length;++ i){ +// ptr[i] = (*this)[i]; +// } +// } +// } /*! * @brief Strided store operation, where scalars are stored in memory @@ -215,7 +217,7 @@ namespace RAJA * Note: this could be done with "scatter" instructions if they are * available. 
*/ - void store(element_type *ptr, size_t stride) const{ + void store(element_type *ptr, size_t stride = 1) const{ if(m_length == s_num_elem){ for(size_t i = 0;i < s_num_registers;++ i){ m_registers[i].store(ptr + i*s_num_register_elem*stride, stride); diff --git a/include/RAJA/pattern/vector/VectorRef.hpp b/include/RAJA/pattern/vector/VectorRef.hpp index 6b4bb9155d..70cf8df96c 100644 --- a/include/RAJA/pattern/vector/VectorRef.hpp +++ b/include/RAJA/pattern/vector/VectorRef.hpp @@ -27,6 +27,92 @@ namespace RAJA { +// namespace internal +// { +// +// template +// struct VectorRefLoadStoreHelper; +// +// template +// struct VectorRefLoadStoreHelper, false>{ +// +// using vector_type = FixedVector; +// +// template +// RAJA_INLINE +// static vector_type load(PTR data, IDX, IDX stride){ +// vector_type value; +// value.load(data, stride); +// return value; +// } +// +// template +// RAJA_INLINE +// static void store(vector_type value, PTR data, IDX stride){ +// value.store(data, stride); +// } +// }; +// +// template +// struct VectorRefLoadStoreHelper, true>{ +// +// using vector_type = FixedVector; +// +// template +// RAJA_INLINE +// static vector_type load(PTR data, IDX, IDX){ +// vector_type value; +// value.load(data); +// return value; +// } +// +// template +// RAJA_INLINE +// static void store(vector_type value, PTR data, IDX){ +// value.store(data); +// } +// }; +// +// template +// struct VectorRefLoadStoreHelper, false>{ +// +// using vector_type = StreamVector; +// +// template +// RAJA_INLINE +// static vector_type load(PTR data, IDX length, IDX stride){ +// vector_type value; +// value.load_n(data, length, stride); +// return value; +// } +// +// template +// RAJA_INLINE +// static void store(vector_type value, PTR data, IDX stride){ +// value.store(data, stride); +// } +// }; +// +// template +// struct VectorRefLoadStoreHelper, true>{ +// +// using vector_type = StreamVector; +// +// template +// RAJA_INLINE +// static vector_type load(PTR data, IDX 
length, IDX){ +// vector_type value; +// value.load_n(data, length); +// return value; +// } +// +// template +// RAJA_INLINE +// static void store(vector_type value, PTR data, IDX){ +// value.store(data); +// } +// }; +// } /*! * \file @@ -53,7 +139,7 @@ namespace RAJA index_type m_linear_index; index_type m_length; pointer_type m_data; - size_t m_stride; + index_type m_stride; public: @@ -95,6 +181,7 @@ namespace RAJA RAJA_INLINE void store(vector_type value) const { +// internal::VectorRefLoadStoreHelper::store(value, m_data+m_linear_index, m_stride); if(STRIDE_ONE){ value.store(m_data+m_linear_index); } @@ -112,10 +199,10 @@ namespace RAJA { vector_type value; if(STRIDE_ONE){ - value.load(m_data+m_linear_index); + value.load_n(m_data+m_linear_index, m_length); } else{ - value.load(m_data+m_linear_index, m_stride); + value.load_n(m_data+m_linear_index, m_length, m_stride); } return value; } diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp index 0128daa33d..41329d8ac6 100644 --- a/include/RAJA/util/View.hpp +++ b/include/RAJA/util/View.hpp @@ -127,7 +127,7 @@ struct View { // Stuff it back into the index using ref_type = VectorRef; - return ref_type(idx, REGISTER::s_num_elem, data, 1); + return ref_type(idx, arg.size(), data, 1); } }; diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index 9fffe895b5..662ec7fe44 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -50,7 +50,12 @@ TYPED_TEST_P(VectorTest, ForallVectorRef1d) using element_t = typename vector_t::element_type; - size_t N = 8000;// + (100*drand48()); + size_t N = 8000; + // If we are not using fixed vectors, add some random number of elements + // to the array to test some postamble code generation. + if(!vector_t::s_is_fixed){ + N += (100*drand48()); + } element_t *A = new element_t[N]; element_t *B = new element_t[N]; From a0b055b69d9b849f2d6958581a52d663a34ada87 Mon Sep 17 00:00:00 2001 From: "Adam J. 
Kunen" Date: Fri, 8 Nov 2019 16:05:42 -0800 Subject: [PATCH 011/593] Merged implementation of FixedVector and StreamVector, recreated them as type aliases to Vector --- include/RAJA/pattern/vector.hpp | 3 +- include/RAJA/pattern/vector/StreamVector.hpp | 493 ------------------ .../vector/{FixedVector.hpp => Vector.hpp} | 296 +++++------ include/RAJA/pattern/vector/VectorRef.hpp | 87 ---- test/unit/test-register.cpp | 2 - test/unit/test-vector.cpp | 5 +- 6 files changed, 154 insertions(+), 732 deletions(-) delete mode 100644 include/RAJA/pattern/vector/StreamVector.hpp rename include/RAJA/pattern/vector/{FixedVector.hpp => Vector.hpp} (61%) diff --git a/include/RAJA/pattern/vector.hpp b/include/RAJA/pattern/vector.hpp index 3d86f4a6ed..e2c9913d38 100644 --- a/include/RAJA/pattern/vector.hpp +++ b/include/RAJA/pattern/vector.hpp @@ -18,8 +18,7 @@ #ifndef RAJA_pattern_vector_HPP #define RAJA_pattern_vector_HPP -#include "RAJA/pattern/vector/FixedVector.hpp" -#include "RAJA/pattern/vector/StreamVector.hpp" +#include "RAJA/pattern/vector/Vector.hpp" #include "RAJA/pattern/vector/VectorRef.hpp" diff --git a/include/RAJA/pattern/vector/StreamVector.hpp b/include/RAJA/pattern/vector/StreamVector.hpp deleted file mode 100644 index c643aa764b..0000000000 --- a/include/RAJA/pattern/vector/StreamVector.hpp +++ /dev/null @@ -1,493 +0,0 @@ -/*! - ****************************************************************************** - * - * \file - * - * \brief RAJA header file defining SIMD/SIMT register operations. - * - ****************************************************************************** - */ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_pattern_vector_streamvector_HPP -#define RAJA_pattern_vector_streamvector_HPP - -#include "RAJA/config.hpp" -#include "RAJA/util/macros.hpp" - -namespace RAJA -{ - - -/*! - * \file - * Vector operation functions in the namespace RAJA - - * - */ - - template - class StreamVector; - - template class REGISTER_TYPE, typename REGISTER_POLICY, typename ELEMENT_TYPE, size_t NUM_REG_ELEM, size_t MAX_ELEM> - class StreamVector, MAX_ELEM> - { - public: - using register_type = - REGISTER_TYPE; - static constexpr size_t s_num_register_elem = NUM_REG_ELEM; - - using self_type = StreamVector; - using element_type = ELEMENT_TYPE; - - static constexpr size_t s_is_fixed = false; - - static constexpr size_t s_num_elem = MAX_ELEM; - static constexpr size_t s_num_registers = - s_num_elem / s_num_register_elem; - - static_assert(s_num_elem % s_num_register_elem == 0, - "StreamVector must use a whole number of registers"); - - - private: - std::array m_registers; - size_t m_length; - - public: - - - /*! - * @brief Default constructor, zeros register contents - */ - RAJA_INLINE - StreamVector() : m_length(s_num_elem) {} - - /*! - * @brief Copy constructor - */ - RAJA_INLINE - StreamVector(self_type const &c) : - m_registers(c.m_registers), - m_length(c.m_length) - {} - - /*! - * @brief Scalar constructor (broadcast) - */ - RAJA_INLINE - StreamVector(element_type const &c) : m_length(s_num_elem) - { - for(size_t i = 0;i < s_num_registers;++ i){ - m_registers[i] = c; - } - } - - /*! - * @brief Get scalar value from vector - * This will not be the most efficient due to the offset calculation. 
- * @param i Offset of scalar to get - * @return Returns scalar value at i - */ - RAJA_INLINE - element_type operator[](size_t i) const - { - // compute the register - size_t r = i/s_num_register_elem; - - // compute the element in the register (equiv: i % s_num_register_elem) - size_t e = i - (r*s_num_register_elem); - - return m_registers[r][e]; - } - - - /*! - * @brief Set scalar value in vector register - * @param i Offset of scalar to set - * @param value Value of scalar to set - */ - RAJA_INLINE - void set(size_t i, element_type value) - { - // compute the register - size_t r = i/s_num_register_elem; - - // compute the element in the register (equiv: i % s_num_register_elem) - size_t e = i - (r*s_num_register_elem); - - m_registers[r].set(e, value); - } - - - /*! - * @brief Load constructor, assuming scalars are in consecutive memory - * locations. - */ -// RAJA_INLINE -// void load(element_type const *ptr){ -// m_length = s_num_elem; -// for(size_t i = 0;i < s_num_registers;++ i){ -// m_registers[i].load(ptr + i*s_num_register_elem); -// } -// } - - /*! - * @brief Strided load constructor, when scalars are located in memory - * locations ptr, ptr+stride, ptr+2*stride, etc. - * - * - * Note: this could be done with "gather" instructions if they are - * available. (like in avx2, but not in avx) - */ - RAJA_INLINE - void load(element_type const *ptr, size_t stride = 1){ - m_length = s_num_elem; - for(size_t i = 0;i < s_num_registers;++ i){ - m_registers[i].load(ptr + i*s_num_register_elem*stride, stride); - } - } - - - /*! - * @brief Load constructor, assuming scalars are in consecutive memory - * locations. - */ -// void load_n(element_type const *ptr, size_t len){ -// if(len == s_num_elem){ -// load(ptr); -// } -// else{ -// m_length = len; -// for(size_t i = 0;i < len;++ i){ -// set(i, ptr[i]); -// } -// } -// } - - /*! - * @brief Strided load constructor, when scalars are located in memory - * locations ptr, ptr+stride, ptr+2*stride, etc. 
- * - * - * Note: this could be done with "gather" instructions if they are - * available. (like in avx2, but not in avx) - */ - void load_n(element_type const *ptr, size_t len, size_t stride = 1){ - if(len == s_num_elem){ - load(ptr, stride); - } - else{ - m_length = len; - for(size_t i = 0;i < len;++ i){ - set(i, ptr[i*stride]); - } - } - } - - - /*! - * @brief Store operation, assuming scalars are in consecutive memory - * locations. - */ -// void store(element_type *ptr) const{ -// if(m_length == s_num_elem){ -// for(size_t i = 0;i < s_num_registers;++ i){ -// m_registers[i].store(ptr + i*s_num_register_elem); -// } -// } -// else{ -// for(size_t i = 0;i < m_length;++ i){ -// ptr[i] = (*this)[i]; -// } -// } -// } - - /*! - * @brief Strided store operation, where scalars are stored in memory - * locations ptr, ptr+stride, ptr+2*stride, etc. - * - * - * Note: this could be done with "scatter" instructions if they are - * available. - */ - void store(element_type *ptr, size_t stride = 1) const{ - if(m_length == s_num_elem){ - for(size_t i = 0;i < s_num_registers;++ i){ - m_registers[i].store(ptr + i*s_num_register_elem*stride, stride); - } - } - else{ - for(size_t i = 0;i < m_length;++ i){ - ptr[i*stride] = (*this)[i]; - } - } - } - - - /*! - * @brief Assign one register to antoher - * @param x Vector to copy - * @return Value of (*this) - */ - RAJA_INLINE - self_type const &operator=(self_type const &x) - { - m_registers = x.m_registers; - m_length = x.m_length; - return *this; - } - - - - /*! - * @brief Assign one register from a scalar - * @param x Vector to copy - * @return Value of (*this) - */ - RAJA_INLINE - self_type const &operator=(element_type const &x) - { - m_length = s_num_elem; - for(size_t i = 0;i < s_num_registers;++ i){ - m_registers[i] = x; - } - return *this; - } - - - /*! 
- * @brief Add two vector registers - * @param x Vector to add to this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator+(self_type const &x) const - { - self_type result = *this; - result += x; - return result; - } - - - /*! - * @brief Add a vector to this vector - * @param x Vector to add to this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator+=(self_type const &x) - { - for(size_t i = 0;i < s_num_registers;++ i){ - m_registers[i] += x.m_registers[i]; - } - m_length = std::min(m_length, x.m_length); - return *this; - } - - - /*! - * @brief Subtract two vector registers - * @param x Vector to subctract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator-(self_type const &x) const - { - self_type result = *this; - result -= x; - return result; - } - - /*! - * @brief Subtract a vector from this vector - * @param x Vector to subtract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator-=(self_type const &x) - { - for(size_t i = 0;i < s_num_registers;++ i){ - m_registers[i] -= x.m_registers[i]; - } - m_length = std::min(m_length, x.m_length); - return *this; - } - - /*! - * @brief Multiply two vector registers, element wise - * @param x Vector to subctract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator*(self_type const &x) const - { - self_type result = *this; - result *= x; - return result; - } - - /*! - * @brief Multiply a vector with this vector - * @param x Vector to multiple with this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator*=(self_type const &x) - { - for(size_t i = 0;i < s_num_registers;++ i){ - m_registers[i] *= x.m_registers[i]; - } - m_length = std::min(m_length, x.m_length); - return *this; - } - - /*! 
- * @brief Divide two vector registers, element wise - * @param x Vector to subctract from this register - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type operator/(self_type const &x) const - { - self_type result = *this; - result /= x; - return result; - } - - /*! - * @brief Divide this vector by another vector - * @param x Vector to divide by - * @return Value of (*this)+x - */ - RAJA_INLINE - self_type const &operator/=(self_type const &x) - { - for(size_t i = 0;i < s_num_registers;++ i){ - m_registers[i] /= x.m_registers[i]; - } - m_length = std::min(m_length, x.m_length); - return *this; - } - - /*! - * @brief Sum the elements of this vector - * @return Sum of the values of the vectors scalar elements - */ - RAJA_INLINE - element_type sum() const - { - element_type result = (element_type)0; - if(m_length == s_num_elem){ - for(size_t i = 0;i < s_num_registers;++ i){ - result += m_registers[i].sum(); - } - } - else{ - for(size_t i = 0;i < m_length;++ i){ - result += (*this)[i]; - } - } - return result; - } - - - /*! - * @brief Dot product of two vectors - * @param x Other vector to dot with this vector - * @return Value of (*this) dot x - */ - RAJA_INLINE - element_type dot(self_type const &x) const - { - self_type z = (*this) * x; - return z.sum(); - } - - - - /*! - * @brief Returns the largest element - * @return The largest scalar element in the register - */ - RAJA_INLINE - element_type max() const - { - if(m_length == s_num_elem){ - element_type result = m_registers[0].max(); - for(size_t i = 1;i < s_num_registers;++ i){ - result = std::max(result, m_registers[i].max()); - } - return result; - } - else{ - element_type result = (*this)[0]; - for(size_t i = 0;i < m_length;++ i){ - result = std::max(result, (*this)[i]); - } - return result; - } - } - - - - /*! 
- * @brief Returns the largest element - * @return The largest scalar element in the register - */ - RAJA_INLINE - element_type min() const - { - if(m_length == s_num_elem){ - element_type result = m_registers[0].min(); - for(size_t i = 1;i < s_num_registers;++ i){ - result = std::min(result, m_registers[i].min()); - } - return result; - } - else{ - element_type result = (*this)[0]; - for(size_t i = 0;i < m_length;++ i){ - result = std::min(result, (*this)[i]); - } - return result; - } - } - - }; - - - template - StreamVector - operator+(ST x, StreamVector const &y){ - return StreamVector(x) + y; - } - - template - StreamVector - operator-(ST x, StreamVector const &y){ - return StreamVector(x) - y; - } - - template - StreamVector - operator*(ST x, StreamVector const &y){ - return StreamVector(x) * y; - } - - template - StreamVector - operator/(ST x, StreamVector const &y){ - return StreamVector(x) / y; - } - -} // namespace RAJA - - -#endif diff --git a/include/RAJA/pattern/vector/FixedVector.hpp b/include/RAJA/pattern/vector/Vector.hpp similarity index 61% rename from include/RAJA/pattern/vector/FixedVector.hpp rename to include/RAJA/pattern/vector/Vector.hpp index 2761ab3800..3f8c00788b 100644 --- a/include/RAJA/pattern/vector/FixedVector.hpp +++ b/include/RAJA/pattern/vector/Vector.hpp @@ -15,8 +15,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef RAJA_pattern_vector_fixedvector_HPP -#define RAJA_pattern_vector_fixedvector_HPP +#ifndef RAJA_pattern_vector_vector_HPP +#define RAJA_pattern_vector_vector_HPP #include "RAJA/config.hpp" @@ -35,27 +35,32 @@ namespace RAJA * */ - template - class FixedVector; + template + class Vector; - template class REGISTER_TYPE, typename REGISTER_POLICY, typename ELEMENT_TYPE, size_t NUM_REG_ELEM, size_t NUM_ELEM> - class FixedVector, NUM_ELEM> + template class REGISTER_TYPE, typename REGISTER_POLICY, typename ELEMENT_TYPE, size_t NUM_REG_ELEM, 
size_t NUM_ELEM, bool FIXED_LENGTH> + class Vector, NUM_ELEM, FIXED_LENGTH> { public: using full_register_type = REGISTER_TYPE; static constexpr size_t s_num_register_elem = NUM_REG_ELEM; - using self_type = FixedVector; + using self_type = Vector; using element_type = ELEMENT_TYPE; - static constexpr size_t s_is_fixed = true; + + static constexpr size_t s_is_fixed = FIXED_LENGTH; static constexpr size_t s_num_elem = NUM_ELEM; static constexpr size_t s_byte_width = sizeof(element_type); static constexpr size_t s_bit_width = s_byte_width*8; + static_assert(s_num_elem % s_num_register_elem == 0 || s_is_fixed, + "Vector must use a whole number of registers if it's variable length"); + + static constexpr size_t s_num_full_registers = s_num_elem / s_num_register_elem; static constexpr size_t s_num_full_elem = s_num_full_registers*s_num_register_elem; @@ -71,28 +76,32 @@ namespace RAJA private: std::array m_full_registers; std::array m_partial_register; + + size_t m_length; public: /*! * @brief Default constructor, zeros register contents */ - FixedVector() = default; + RAJA_INLINE + Vector() : m_length(s_num_elem){} /*! * @brief Copy constructor */ RAJA_INLINE - FixedVector(self_type const &c) : + Vector(self_type const &c) : m_full_registers(c.m_full_registers), - m_partial_register(c.m_partial_register) + m_partial_register(c.m_partial_register), + m_length(c.m_length) {} /*! * @brief Scalar constructor (broadcast) */ RAJA_INLINE - FixedVector(element_type const &c) + Vector(element_type const &c) { for(size_t i = 0;i < s_num_full_registers;++ i){ m_full_registers[i] = c; @@ -100,33 +109,20 @@ namespace RAJA if(s_num_partial_registers){ m_partial_register[0] = c; } + m_length = s_num_elem; } - /*! - * @brief Load constructor, assuming scalars are in consecutive memory - * locations. 
- */ -// RAJA_INLINE -// void load(element_type const *ptr){ -// for(size_t i = 0;i < s_num_full_registers;++ i){ -// m_full_registers[i].load(ptr + i*s_num_register_elem); -// } -// if(s_num_partial_registers){ -// m_partial_register[0].load(ptr + s_num_full_elem); -// } -// } + /*! * @brief Strided load constructor, when scalars are located in memory * locations ptr, ptr+stride, ptr+2*stride, etc. * - * - * Note: this could be done with "gather" instructions if they are - * available. (like in avx2, but not in avx) */ RAJA_INLINE void load(element_type const *ptr, size_t stride = 1){ + m_length = s_num_elem; for(size_t i = 0;i < s_num_full_registers;++ i){ m_full_registers[i].load(ptr + i*stride*s_num_register_elem, stride); } @@ -139,28 +135,23 @@ namespace RAJA * @brief Load constructor, assuming scalars are in consecutive memory * locations. * - * Since this is a Fixed length vector, the length arguments is ignored + * For fixed length vectors, the length arguments is ignored, otherwise + * only the specified number of values is read in. */ RAJA_INLINE - void load_n(element_type const *ptr, size_t , size_t stride = 1){ - load(ptr, stride); + void load_n(element_type const *ptr, size_t length, size_t stride = 1){ + m_length = length; + if(s_is_fixed || length == s_num_elem){ + load(ptr, stride); + } + else{ + for(size_t i = 0;i < length;++ i){ + set(i, ptr[i*stride]); + } + } } - /*! - * @brief Store operation, assuming scalars are in consecutive memory - * locations. - */ -// RAJA_INLINE -// void store(element_type *ptr) const{ -// for(size_t i = 0;i < s_num_full_registers;++ i){ -// m_full_registers[i].store(ptr + i*s_num_register_elem); -// } -// if(s_num_partial_registers){ -// m_partial_register[0].store(ptr + s_num_full_elem); -// } -// } - /*! * @brief Strided store operation, where scalars are stored in memory * locations ptr, ptr+stride, ptr+2*stride, etc. 
@@ -171,11 +162,18 @@ namespace RAJA */ RAJA_INLINE void store(element_type *ptr, size_t stride = 1) const{ - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i].store(ptr + i*stride*s_num_register_elem, stride); + if(s_is_fixed || m_length == s_num_elem){ + for(size_t i = 0;i < s_num_full_registers;++ i){ + m_full_registers[i].store(ptr + i*stride*s_num_register_elem, stride); + } + if(s_num_partial_registers){ + m_partial_register[0].store(ptr + stride*s_num_full_elem, stride); + } } - if(s_num_partial_registers){ - m_partial_register[0].store(ptr + stride*s_num_full_elem, stride); + else{ + for(size_t i = 0;i < m_length;++ i){ + ptr[i*stride] = (*this)[i]; + } } } @@ -195,10 +193,12 @@ namespace RAJA // compute the element in the register (equiv: i % s_num_register_elem) size_t e = i - (r*s_num_register_elem); - if(r < s_num_full_registers){ + if(!s_is_fixed || r < s_num_full_registers){ return m_full_registers[r][e]; } - return m_partial_register[0][e]; + else{ + return m_partial_register[0][e]; + } } @@ -216,7 +216,7 @@ namespace RAJA // compute the element in the register (equiv: i % s_num_register_elem) size_t e = i - (r*s_num_register_elem); - if(r < s_num_full_registers){ + if(!s_is_fixed || r < s_num_full_registers){ m_full_registers[r].set(e, value); } else{ @@ -237,7 +237,7 @@ namespace RAJA if(s_num_partial_registers){ m_partial_register[0] = value; } - return *this; + m_length = s_num_elem; } /*! 
@@ -248,12 +248,12 @@ namespace RAJA RAJA_INLINE self_type const &operator=(self_type const &x) { - for(size_t i = 0;i < s_num_full_registers;++ i){ - m_full_registers[i] = x.m_full_registers[i]; - } - if(s_num_partial_registers){ - m_partial_register[0] = x.m_partial_register[0]; + m_full_registers = x.m_full_registers; + if(s_is_fixed && s_num_partial_registers){ + m_partial_register = x.m_partial_register; } + m_length = x.m_length; + return *this; } @@ -266,15 +266,8 @@ namespace RAJA RAJA_INLINE self_type operator+(self_type const &x) const { - self_type result(*this); - - for(size_t i = 0;i < s_num_full_registers;++ i){ - result.m_full_registers[i] += x.m_full_registers[i]; - } - if(s_num_partial_registers){ - result.m_partial_register[0] += x.m_partial_register[0]; - } - + self_type result = *this; + result += x; return result; } @@ -289,9 +282,10 @@ namespace RAJA for(size_t i = 0;i < s_num_full_registers;++ i){ m_full_registers[i] += x.m_full_registers[i]; } - if(s_num_partial_registers){ + if(s_is_fixed && s_num_partial_registers){ m_partial_register[0] += x.m_partial_register[0]; } + m_length = std::min(m_length, x.m_length); return *this; } @@ -304,15 +298,8 @@ namespace RAJA RAJA_INLINE self_type operator-(self_type const &x) const { - self_type result(*this); - - for(size_t i = 0;i < s_num_full_registers;++ i){ - result.m_full_registers[i] -= x.m_full_registers[i]; - } - if(s_num_partial_registers){ - result.m_partial_register[0] -= x.m_partial_register[0]; - } - + self_type result = *this; + result -= x; return result; } @@ -327,9 +314,10 @@ namespace RAJA for(size_t i = 0;i < s_num_full_registers;++ i){ m_full_registers[i] -= x.m_full_registers[i]; } - if(s_num_partial_registers){ + if(s_is_fixed && s_num_partial_registers){ m_partial_register[0] -= x.m_partial_register[0]; } + m_length = std::min(m_length, x.m_length); return *this; } @@ -342,15 +330,8 @@ namespace RAJA RAJA_INLINE self_type operator*(self_type const &x) const { - self_type 
result(*this); - - for(size_t i = 0;i < s_num_full_registers;++ i){ - result.m_full_registers[i] *= x.m_full_registers[i]; - } - if(s_num_partial_registers){ - result.m_partial_register[0] *= x.m_partial_register[0]; - } - + self_type result = *this; + result *= x; return result; } @@ -365,9 +346,10 @@ namespace RAJA for(size_t i = 0;i < s_num_full_registers;++ i){ m_full_registers[i] *= x.m_full_registers[i]; } - if(s_num_partial_registers){ + if(s_is_fixed && s_num_partial_registers){ m_partial_register[0] *= x.m_partial_register[0]; } + m_length = std::min(m_length, x.m_length); return *this; } @@ -380,15 +362,8 @@ namespace RAJA RAJA_INLINE self_type operator/(self_type const &x) const { - self_type result(*this); - - for(size_t i = 0;i < s_num_full_registers;++ i){ - result.m_full_registers[i] /= x.m_full_registers[i]; - } - if(s_num_partial_registers){ - result.m_partial_register[0] /= x.m_partial_register[0]; - } - + self_type result = *this; + result /= x; return result; } @@ -403,9 +378,10 @@ namespace RAJA for(size_t i = 0;i < s_num_full_registers;++ i){ m_full_registers[i] /= x.m_full_registers[i]; } - if(s_num_partial_registers){ + if(s_is_fixed && s_num_partial_registers){ m_partial_register[0] /= x.m_partial_register[0]; } + m_length = std::min(m_length, x.m_length); return *this; } @@ -418,11 +394,18 @@ namespace RAJA element_type sum() const { element_type result = (element_type)0; - for(size_t i = 0;i < s_num_full_registers;++ i){ - result += m_full_registers[i].sum(); + if(m_length == s_num_elem){ + for(size_t i = 0;i < s_num_full_registers;++ i){ + result += m_full_registers[i].sum(); + } + if(s_num_partial_registers){ + result += m_partial_register[0].sum(); + } } - if(s_num_partial_registers){ - result += m_partial_register[0].sum(); + else{ + for(size_t i = 0;i < m_length;++ i){ + result += (*this)[i]; + } } return result; } @@ -431,18 +414,14 @@ namespace RAJA * @brief Dot product of two vectors * @param x Other vector to dot with this vector 
* @return Value of (*this) dot x + * + * NOTE: we could really do something more optimized here! */ RAJA_INLINE element_type dot(self_type const &x) const { - element_type result = (element_type)0; - for(size_t i = 0;i < s_num_full_registers;++ i){ - result += m_full_registers[i].dot(x.m_full_registers[i]); - } - if(s_num_partial_registers){ - result += m_partial_register[0].dot(x.m_partial_register[0]); - } - return result; + self_type z = (*this) * x; + return z.sum(); } @@ -453,18 +432,27 @@ namespace RAJA RAJA_INLINE element_type max() const { - if(s_num_full_registers == 0){ - return m_partial_register[0].max(); + if(s_is_fixed || m_length == s_num_elem){ + if(s_num_full_registers == 0){ + return m_partial_register[0].max(); + } + + element_type result = (element_type)m_full_registers[0].max(); + for(size_t i = 1;i < s_num_full_registers;++ i){ + result = std::max(result, m_full_registers[i].max()); + } + if(s_num_partial_registers){ + result = std::max(result, m_partial_register[0].max()); + } + return result; } - - element_type result = (element_type)m_full_registers[0].max(); - for(size_t i = 1;i < s_num_full_registers;++ i){ - result = std::max(result, m_full_registers[i].max()); - } - if(s_num_partial_registers){ - result = std::max(result, m_partial_register[0].max()); + else{ + element_type result = (*this)[0]; + for(size_t i = 0;i < m_length;++ i){ + result = std::max(result, (*this)[i]); + } + return result; } - return result; } /*! 
@@ -474,46 +462,62 @@ namespace RAJA RAJA_INLINE element_type min() const { - if(s_num_full_registers == 0){ - return m_partial_register[0].min(); + if(s_is_fixed || m_length == s_num_elem){ + if(s_num_full_registers == 0){ + return m_partial_register[0].min(); + } + + element_type result = (element_type)m_full_registers[0].min(); + for(size_t i = 1;i < s_num_full_registers;++ i){ + result = std::min(result, m_full_registers[i].min()); + } + if(s_num_partial_registers){ + result = std::min(result, m_partial_register[0].min()); + } + return result; } - - element_type result = (element_type)m_full_registers[0].min(); - for(size_t i = 1;i < s_num_full_registers;++ i){ - result = std::min(result, m_full_registers[i].min()); - } - if(s_num_partial_registers){ - result = std::min(result, m_partial_register[0].min()); + else{ + element_type result = (*this)[0]; + for(size_t i = 0;i < m_length;++ i){ + result = std::min(result, (*this)[i]); + } + return result; } - return result; } }; - template - FixedVector - operator+(ST x, FixedVector const &y){ - return FixedVector(x) + y; + template + using FixedVector = Vector; + + template + using StreamVector = Vector; + + + template + Vector + operator+(ST x, Vector const &y){ + return Vector(x) + y; } - template - FixedVector - operator-(ST x, FixedVector const &y){ - return FixedVector(x) - y; + template + Vector + operator-(ST x, Vector const &y){ + return Vector(x) - y; } - template - FixedVector - operator*(ST x, FixedVector const &y){ - return FixedVector(x) * y; + template + Vector + operator*(ST x, Vector const &y){ + return Vector(x) * y; } - template - FixedVector - operator/(ST x, FixedVector const &y){ - return FixedVector(x) / y; + template + Vector + operator/(ST x, Vector const &y){ + return Vector(x) / y; } } // namespace RAJA diff --git a/include/RAJA/pattern/vector/VectorRef.hpp b/include/RAJA/pattern/vector/VectorRef.hpp index 70cf8df96c..1b8c7cf221 100644 --- a/include/RAJA/pattern/vector/VectorRef.hpp +++ 
b/include/RAJA/pattern/vector/VectorRef.hpp @@ -27,92 +27,6 @@ namespace RAJA { -// namespace internal -// { -// -// template -// struct VectorRefLoadStoreHelper; -// -// template -// struct VectorRefLoadStoreHelper, false>{ -// -// using vector_type = FixedVector; -// -// template -// RAJA_INLINE -// static vector_type load(PTR data, IDX, IDX stride){ -// vector_type value; -// value.load(data, stride); -// return value; -// } -// -// template -// RAJA_INLINE -// static void store(vector_type value, PTR data, IDX stride){ -// value.store(data, stride); -// } -// }; -// -// template -// struct VectorRefLoadStoreHelper, true>{ -// -// using vector_type = FixedVector; -// -// template -// RAJA_INLINE -// static vector_type load(PTR data, IDX, IDX){ -// vector_type value; -// value.load(data); -// return value; -// } -// -// template -// RAJA_INLINE -// static void store(vector_type value, PTR data, IDX){ -// value.store(data); -// } -// }; -// -// template -// struct VectorRefLoadStoreHelper, false>{ -// -// using vector_type = StreamVector; -// -// template -// RAJA_INLINE -// static vector_type load(PTR data, IDX length, IDX stride){ -// vector_type value; -// value.load_n(data, length, stride); -// return value; -// } -// -// template -// RAJA_INLINE -// static void store(vector_type value, PTR data, IDX stride){ -// value.store(data, stride); -// } -// }; -// -// template -// struct VectorRefLoadStoreHelper, true>{ -// -// using vector_type = StreamVector; -// -// template -// RAJA_INLINE -// static vector_type load(PTR data, IDX length, IDX){ -// vector_type value; -// value.load_n(data, length); -// return value; -// } -// -// template -// RAJA_INLINE -// static void store(vector_type value, PTR data, IDX){ -// value.store(data); -// } -// }; -// } /*! 
* \file @@ -181,7 +95,6 @@ namespace RAJA RAJA_INLINE void store(vector_type value) const { -// internal::VectorRefLoadStoreHelper::store(value, m_data+m_linear_index, m_stride); if(STRIDE_ONE){ value.store(m_data+m_linear_index); } diff --git a/test/unit/test-register.cpp b/test/unit/test-register.cpp index 1086d57a5c..b8a4ba4b09 100644 --- a/test/unit/test-register.cpp +++ b/test/unit/test-register.cpp @@ -12,8 +12,6 @@ #include "RAJA/RAJA.hpp" #include "gtest/gtest.h" -#include "RAJA/pattern/register.hpp" - using RegisterTestTypes = ::testing::Types< RAJA::Register, diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index 662ec7fe44..d894e5805b 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -12,8 +12,6 @@ #include "RAJA/RAJA.hpp" #include "gtest/gtest.h" -#include "RAJA/pattern/register.hpp" -#include "RAJA/pattern/vector.hpp" using VectorTestTypes = ::testing::Types< @@ -23,6 +21,9 @@ using VectorTestTypes = ::testing::Types< RAJA::StreamVector, 8>>; +//using VectorTestTypes = ::testing::Types< +// RAJA::FixedVector, 8>>; + template class VectorTest : public ::testing::Test { From 48d34da238bfc1428d000deb7c1ab31e3d80c373 Mon Sep 17 00:00:00 2001 From: "Adam J. Kunen" Date: Fri, 8 Nov 2019 16:09:41 -0800 Subject: [PATCH 012/593] Fixed OffsetLayout to provide its linear index type --- include/RAJA/util/OffsetLayout.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/RAJA/util/OffsetLayout.hpp b/include/RAJA/util/OffsetLayout.hpp index b6011dbd74..160c3aa567 100644 --- a/include/RAJA/util/OffsetLayout.hpp +++ b/include/RAJA/util/OffsetLayout.hpp @@ -46,6 +46,7 @@ template struct OffsetLayout_impl, IdxLin> { using Self = OffsetLayout_impl, IdxLin>; using IndexRange = camp::idx_seq; + using IndexLinear = IdxLin; using Base = detail::LayoutBase_impl; Base base_; From b68377cf2a6ebb30bdf7702eccd66144766db687 Mon Sep 17 00:00:00 2001 From: "Adam J. 
Kunen" Date: Fri, 8 Nov 2019 17:23:51 -0800 Subject: [PATCH 013/593] Fixed to correctly fall back on scalar if SIMD is not enabled, also added better -march=native flags to compiler scripts --- host-configs/lc-builds/toss3/clang_X.cmake | 4 ++-- host-configs/lc-builds/toss3/gcc_X.cmake | 4 ++-- .../lc-builds/toss3/icpc_X_gcc7headers.cmake | 4 ++-- .../lc-builds/toss3/icpc_X_gcc8headers.cmake | 4 ++-- include/RAJA/policy/simd/register.hpp | 19 ++++++++++++++++++- test/unit/test-register.cpp | 9 +++++++-- test/unit/test-vector.cpp | 9 +++++---- tpl/camp | 2 +- 8 files changed, 39 insertions(+), 16 deletions(-) diff --git a/host-configs/lc-builds/toss3/clang_X.cmake b/host-configs/lc-builds/toss3/clang_X.cmake index b892855cca..ad1e753e95 100755 --- a/host-configs/lc-builds/toss3/clang_X.cmake +++ b/host-configs/lc-builds/toss3/clang_X.cmake @@ -7,8 +7,8 @@ set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELEASE "-O3 -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -msse4.2 -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -march=native -funroll-loops -finline-functions" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") set(RAJA_RANGE_ALIGN 4 CACHE STRING "") diff --git a/host-configs/lc-builds/toss3/gcc_X.cmake b/host-configs/lc-builds/toss3/gcc_X.cmake index 6dc135302c..fc22c00ef6 100755 --- a/host-configs/lc-builds/toss3/gcc_X.cmake +++ b/host-configs/lc-builds/toss3/gcc_X.cmake @@ -7,8 +7,8 @@ set(RAJA_COMPILER "RAJA_COMPILER_GNU" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions -finline-limit=20000" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions -finline-limit=20000" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -march=native -finline-functions 
-finline-limit=20000" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -march=native -g -finline-functions -finline-limit=20000" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") set(RAJA_RANGE_ALIGN 4 CACHE STRING "") diff --git a/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake b/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake index 3585a4d7d2..a37613657e 100755 --- a/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake +++ b/host-configs/lc-builds/toss3/icpc_X_gcc7headers.cmake @@ -9,8 +9,8 @@ set(RAJA_COMPILER "RAJA_COMPILER_ICC" CACHE STRING "") set(COMMON_FLAGS "-gxx-name=/usr/tce/packages/gcc/gcc-7.1.0/bin/g++") -set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -fp-model source -unroll-aggressive -finline-functions -axCORE-AVX2 -diag-disable cpu-dispatch" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -fp-model source -unroll-aggressive -finline-functions -axCORE-AVX2 -diag-disable cpu-dispatch" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -fp-model source -unroll-aggressive -finline-functions -march=native -diag-disable cpu-dispatch" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -fp-model source -unroll-aggressive -finline-functions -march=native -diag-disable cpu-dispatch" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "") set(RAJA_RANGE_ALIGN 4 CACHE STRING "") diff --git a/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake b/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake index fae517cd45..5c124cfe91 100755 --- a/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake +++ b/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake @@ -9,8 +9,8 @@ set(RAJA_COMPILER "RAJA_COMPILER_ICC" CACHE STRING "") set(COMMON_FLAGS "-gxx-name=/usr/tce/packages/gcc/gcc-8.1.0/bin/g++") -set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -march=native -ansi-alias -axCORE-AVX2 -diag-disable cpu-dispatch" CACHE 
STRING "") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -march=native -ansi-alias -axCORE-AVX2 -diag-disable cpu-dispatch" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -march=native -ansi-alias -march=native -diag-disable cpu-dispatch" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -march=native -ansi-alias -march=native -diag-disable cpu-dispatch" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "") set(RAJA_RANGE_ALIGN 4 CACHE STRING "") diff --git a/include/RAJA/policy/simd/register.hpp b/include/RAJA/policy/simd/register.hpp index 7ce43a3f67..40ba0d2a0f 100644 --- a/include/RAJA/policy/simd/register.hpp +++ b/include/RAJA/policy/simd/register.hpp @@ -23,7 +23,24 @@ #ifdef __AVX__ #include +#ifndef RAJA_SIMD_REGISTER_TYPE +#define RAJA_SIMD_REGISTER_TYPE simd_avx_register +#define RAJA_SIMD_REGISTER_WIDTH 256 #endif +#endif + + +namespace RAJA +{ + struct simd_scalar_register {}; +} + + +#ifndef RAJA_SIMD_REGISTER_TYPE +#define RAJA_SIMD_REGISTER_TYPE RAJA::simd_scalar_register +#define RAJA_SIMD_REGISTER_WIDTH 0 +#endif + namespace RAJA { @@ -34,7 +51,7 @@ namespace policy // This sets the default SIMD register that will be used // Individual registers can - using simd_register = simd_avx_register; + using simd_register = RAJA_SIMD_REGISTER_TYPE; } } diff --git a/test/unit/test-register.cpp b/test/unit/test-register.cpp index b8a4ba4b09..ec9eca7179 100644 --- a/test/unit/test-register.cpp +++ b/test/unit/test-register.cpp @@ -12,7 +12,12 @@ #include "RAJA/RAJA.hpp" #include "gtest/gtest.h" - +#if RAJA_SIMD_REGISTER_WIDTH == 0 + using RegisterTestTypes = ::testing::Types< + RAJA::Register, + RAJA::Register, + RAJA::Register>; +#else using RegisterTestTypes = ::testing::Types< RAJA::Register, RAJA::Register, @@ -28,7 +33,7 @@ using RegisterTestTypes = ::testing::Types< RAJA::StreamVector, 8>, RAJA::StreamVector, 12>, RAJA::StreamVector, 16>>; - +#endif template 
class RegisterTest : public ::testing::Test diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index d894e5805b..b36cc2cda3 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -13,17 +13,18 @@ #include "gtest/gtest.h" - +#if RAJA_SIMD_REGISTER_WIDTH == 0 +using VectorTestTypes = ::testing::Types< + RAJA::FixedVector, 4>>; +#else using VectorTestTypes = ::testing::Types< RAJA::FixedVector, 4>, RAJA::FixedVector, 8>, RAJA::StreamVector, 4>, RAJA::StreamVector, 8>>; +#endif -//using VectorTestTypes = ::testing::Types< -// RAJA::FixedVector, 8>>; - template class VectorTest : public ::testing::Test { diff --git a/tpl/camp b/tpl/camp index 1adb82ed1e..fb4e369cdd 160000 --- a/tpl/camp +++ b/tpl/camp @@ -1 +1 @@ -Subproject commit 1adb82ed1e9595fe94fecfb8a5d3f4c71df5cca7 +Subproject commit fb4e369cdd8f19cadd6e2f3b33a6e82ceedcc8a2 From c6a476fbce079b8263f331b462b7462847333810 Mon Sep 17 00:00:00 2001 From: "Adam J. Kunen" Date: Mon, 11 Nov 2019 09:19:49 -0800 Subject: [PATCH 014/593] Added avx2, mostly the same as avx but highlights how to extend to other instruction sets --- include/RAJA/policy/simd/register.hpp | 11 +- include/RAJA/policy/simd/register/avx.hpp | 7 +- include/RAJA/policy/simd/register/avx2.hpp | 35 ++ .../policy/simd/register/avx2_double2.hpp | 340 +++++++++++++++++ .../policy/simd/register/avx2_double3.hpp | 360 ++++++++++++++++++ .../policy/simd/register/avx2_double4.hpp | 360 ++++++++++++++++++ .../RAJA/policy/simd/register/avx_double2.hpp | 7 +- .../RAJA/policy/simd/register/avx_double3.hpp | 7 +- .../RAJA/policy/simd/register/avx_double4.hpp | 10 +- test/unit/test-register.cpp | 45 ++- test/unit/test-vector.cpp | 18 +- 11 files changed, 1164 insertions(+), 36 deletions(-) create mode 100644 include/RAJA/policy/simd/register/avx2.hpp create mode 100644 include/RAJA/policy/simd/register/avx2_double2.hpp create mode 100644 include/RAJA/policy/simd/register/avx2_double3.hpp create mode 100644 
include/RAJA/policy/simd/register/avx2_double4.hpp diff --git a/include/RAJA/policy/simd/register.hpp b/include/RAJA/policy/simd/register.hpp index 40ba0d2a0f..7cfd234e07 100644 --- a/include/RAJA/policy/simd/register.hpp +++ b/include/RAJA/policy/simd/register.hpp @@ -21,11 +21,19 @@ #include #include + +#ifdef __AVX2__ +#include +#ifndef RAJA_SIMD_REGISTER_TYPE +#define RAJA_SIMD_REGISTER_TYPE simd_avx2_register +#endif +#endif + + #ifdef __AVX__ #include #ifndef RAJA_SIMD_REGISTER_TYPE #define RAJA_SIMD_REGISTER_TYPE simd_avx_register -#define RAJA_SIMD_REGISTER_WIDTH 256 #endif #endif @@ -38,7 +46,6 @@ namespace RAJA #ifndef RAJA_SIMD_REGISTER_TYPE #define RAJA_SIMD_REGISTER_TYPE RAJA::simd_scalar_register -#define RAJA_SIMD_REGISTER_WIDTH 0 #endif diff --git a/include/RAJA/policy/simd/register/avx.hpp b/include/RAJA/policy/simd/register/avx.hpp index 715fd3f622..3ab0de36c4 100644 --- a/include/RAJA/policy/simd/register/avx.hpp +++ b/include/RAJA/policy/simd/register/avx.hpp @@ -3,7 +3,7 @@ * * \file * - * \brief Header file containing RAJA simd policy definitions. + * \brief Header file containing SIMD abstractions for AVX * ****************************************************************************** */ @@ -15,6 +15,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#ifdef __AVX__ + #ifndef RAJA_policy_simd_register_avx_HPP #define RAJA_policy_simd_register_avx_HPP @@ -28,3 +30,6 @@ namespace RAJA { #include #include #include + + +#endif // __AVX__ diff --git a/include/RAJA/policy/simd/register/avx2.hpp b/include/RAJA/policy/simd/register/avx2.hpp new file mode 100644 index 0000000000..4ab996117d --- /dev/null +++ b/include/RAJA/policy/simd/register/avx2.hpp @@ -0,0 +1,35 @@ +/*! 
+ ****************************************************************************** + * + * \file + * + * \brief Header file containing SIMD abstractions for AVX2 + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifdef __AVX2__ + +#ifndef RAJA_policy_simd_register_avx2_HPP +#define RAJA_policy_simd_register_avx2_HPP + +namespace RAJA { + struct simd_avx2_register {}; +} + + +#endif + +#include +#include +#include + + +#endif // __AVX2__ diff --git a/include/RAJA/policy/simd/register/avx2_double2.hpp b/include/RAJA/policy/simd/register/avx2_double2.hpp new file mode 100644 index 0000000000..48787e8a89 --- /dev/null +++ b/include/RAJA/policy/simd/register/avx2_double2.hpp @@ -0,0 +1,340 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining a SIMD register abstraction. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifdef __AVX2__ + +#ifndef RAJA_policy_simd_register_avx2_double2_HPP +#define RAJA_policy_simd_register_avx2_double2_HPP + +#include "RAJA/config.hpp" +#include "RAJA/util/macros.hpp" + +// Include SIMD intrinsics header file +#include +#include + + +namespace RAJA +{ + + + template<> + class Register{ + public: + using self_type = Register; + using element_type = double; + + static constexpr size_t s_num_elem = 2; + static constexpr size_t s_byte_width = s_num_elem*sizeof(double); + static constexpr size_t s_bit_width = s_byte_width*8; + + using simd_type = __m128d; + + private: + simd_type m_value; + + public: + + /*! + * @brief Default constructor, zeros register contents + */ + Register() : m_value(_mm_setzero_pd()) { + } + + /*! + * @brief Copy constructor from underlying simd register + */ + explicit Register(simd_type const &c) : m_value(c) {} + + + /*! + * @brief Copy constructor + */ + Register(self_type const &c) : m_value(c.m_value) {} + + /*! + * @brief Construct from scalar. + * Sets all elements to same value (broadcast). + */ + RAJA_INLINE + Register(element_type const &c) : m_value(_mm_set1_pd(c)) {} + + /*! + * @brief Load operation, assuming scalars are in consecutive memory + * locations. + */ + void load(element_type const *ptr){ + m_value = _mm_loadu_pd(ptr); + } + + /*! + * @brief Strided load operation, when scalars are located in memory + * locations ptr, ptr+stride + * + * + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) + */ + void load(element_type const *ptr, size_t stride){ + m_value = _mm_set_pd(ptr[stride], ptr[0]); + } + + + /*! + * @brief Store operation, assuming scalars are in consecutive memory + * locations. + */ + void store(element_type *ptr) const{ + _mm_storeu_pd(ptr, m_value); + } + + /*! 
+ * @brief Strided store operation, where scalars are stored in memory + * locations ptr and ptr+stride. + * + * + * Note: this could be done with "scatter" instructions if they are + * available. + */ + void store(element_type *ptr, size_t stride) const{ + ptr[0] = m_value[0]; + ptr[stride] = m_value[1]; + } + + /*! + * @brief Get scalar value from vector register + * @param i Offset of scalar to get + * @return Returns scalar value at i + */ + template + constexpr + RAJA_INLINE + element_type operator[](IDX i) const + {return m_value[i];} + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + template + RAJA_INLINE + void set(IDX i, element_type value) + {m_value[i] = value;} + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + m_value = _mm_set1_pd(value); + return *this; + } + + /*! + * @brief Assign one register to another + * @param x Vector to copy + * @return Value of (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + m_value = x.m_value; + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + return self_type(_mm_add_pd(m_value, x.m_value)); + } + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + m_value = _mm_add_pd(m_value, x.m_value); + return *this; + } + + /*!
+ * @brief Subtract two vector registers + * @param x Vector to subtract from this register + * @return Value of (*this)-x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + return self_type(_mm_sub_pd(m_value, x.m_value)); + } + + /*! + * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Value of (*this)-x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + m_value = _mm_sub_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to multiply with this register + * @return Value of (*this)*x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + return self_type(_mm_mul_pd(m_value, x.m_value)); + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiply with this register + * @return Value of (*this)*x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + m_value = _mm_mul_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to divide this register by + * @return Value of (*this)/x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + return self_type(_mm_div_pd(m_value, x.m_value)); + } + + /*! + * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Value of (*this)/x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + m_value = _mm_div_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Sum the elements of this vector + * @return Sum of the values of the vector's scalar elements + */ + RAJA_INLINE + element_type sum() const + { + auto hsum = _mm_hadd_pd(m_value, m_value); + return hsum[0]; + } + + /*!
+ * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + return self_type(_mm_mul_pd(m_value, x.m_value)).sum(); + } + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + // swap the two lanes + simd_type a = _mm_permute_pd(m_value, 0x01); + + // take the max of each lane (should be same result in each lane) + simd_type b = _mm_max_pd(m_value, a); + + // return the lower lane + return b[0]; + } + + /*! + * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmax(self_type a) const + { + return self_type(_mm_max_pd(m_value, a.m_value)); + } + + /*! + * @brief Returns the smallest element + * @return The smallest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + // swap the two lanes + simd_type a = _mm_permute_pd(m_value, 0x01); + + // take the min of each lane (should be same result in each lane) + simd_type b = _mm_min_pd(m_value, a); + + // return the lower lane + return b[0]; + } + + /*! + * @brief Returns element-wise smallest values + * @return Vector of the element-wise min values + */ + RAJA_INLINE + self_type vmin(self_type a) const + { + return self_type(_mm_min_pd(m_value, a.m_value)); + } + }; + + + +} // namespace RAJA + + +#endif + +#endif //__AVX2__ diff --git a/include/RAJA/policy/simd/register/avx2_double3.hpp b/include/RAJA/policy/simd/register/avx2_double3.hpp new file mode 100644 index 0000000000..c42f73c276 --- /dev/null +++ b/include/RAJA/policy/simd/register/avx2_double3.hpp @@ -0,0 +1,360 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining a SIMD register abstraction.
+ * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifdef __AVX2__ + +#ifndef RAJA_policy_simd_register_avx2_double3_HPP +#define RAJA_policy_simd_register_avx2_double3_HPP + +#include "RAJA/config.hpp" +#include "RAJA/util/macros.hpp" + +// Include SIMD intrinsics header file +#include +#include + + +namespace RAJA +{ + + + template<> + class Register{ + public: + using self_type = Register; + using element_type = double; + + static constexpr size_t s_num_elem = 3; + static constexpr size_t s_byte_width = s_num_elem*sizeof(double); + static constexpr size_t s_bit_width = s_byte_width*8; + + // Using a 256-bit (4 double) vector, but padding out the upper most + // value + using simd_type = __m256d; + + + private: + simd_type m_value; + + // Mask used to mask off the upper double from the vector + using mask_type = __m256i; + //static constexpr mask_type s_mask = (__m256i)(__v4di){ -1, -1, -1, 0}; + + public: + + /*! + * @brief Default constructor, zeros register contents + */ + RAJA_INLINE + Register() : m_value(_mm256_setzero_pd()) { + } + + /*! + * @brief Copy constructor from underlying simd register + */ + RAJA_INLINE + explicit Register(simd_type const &c) : m_value(c) {} + + + /*! + * @brief Copy constructor + */ + RAJA_INLINE + Register(self_type const &c) : m_value(c.m_value) {} + + /*! + * @brief Construct from scalar. + * Sets all elements to same value (broadcast). + */ + RAJA_INLINE + Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {} + + /*! + * @brief Load constructor, assuming scalars are in consecutive memory + * locations. 
+ */ + RAJA_INLINE + void load(element_type const *ptr){ + m_value = _mm256_maskload_pd(ptr, (__m256i)(__v4di){ -1, -1, -1, 0}); + } + + /*! + * @brief Strided load constructor, when scalars are located in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) + */ + RAJA_INLINE + void load(element_type const *ptr, size_t stride){ + m_value =_mm256_set_pd(0.0, + ptr[2*stride], + ptr[stride], + ptr[0]); + } + + + /*! + * @brief Store operation, assuming scalars are in consecutive memory + * locations. + */ + RAJA_INLINE + void store(element_type *ptr) const{ + _mm256_maskstore_pd(ptr, (__m256i)(__v4di){ -1, -1, -1, 0}, m_value); + } + + /*! + * @brief Strided store operation, where scalars are stored in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "scatter" instructions if they are + * available. + */ + RAJA_INLINE + void store(element_type *ptr, size_t stride) const{ + for(size_t i = 0;i < s_num_elem;++ i){ + ptr[i*stride] = m_value[i]; + } + } + + /*! + * @brief Get scalar value from vector register + * @param i Offset of scalar to get + * @return Returns scalar value at i + */ + template + constexpr + RAJA_INLINE + element_type operator[](IDX i) const + {return m_value[i];} + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + template + RAJA_INLINE + void set(IDX i, element_type value) + {m_value[i] = value;} + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + m_value = _mm256_set1_pd(value); + return *this; + } + + /*! 
+ * @brief Assign one register to another + * @param x Vector to copy + * @return Value of (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + m_value = x.m_value; + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + return self_type(_mm256_add_pd(m_value, x.m_value)); + } + + /*! + * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + m_value = _mm256_add_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Subtract two vector registers + * @param x Vector to subtract from this register + * @return Value of (*this)-x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + return self_type(_mm256_sub_pd(m_value, x.m_value)); + } + + /*! + * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Value of (*this)-x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + m_value = _mm256_sub_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to multiply with this register + * @return Value of (*this)*x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + return self_type(_mm256_mul_pd(m_value, x.m_value)); + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiply with this register + * @return Value of (*this)*x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + m_value = _mm256_mul_pd(m_value, x.m_value); + return *this; + } + + /*!
+ * @brief Divide two vector registers, element wise + * @param x Vector to divide this register by + * @return Value of (*this)/x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + return self_type(_mm256_div_pd(m_value, x.m_value)); + } + + /*! + * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Value of (*this)/x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + m_value = _mm256_div_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Sum the elements of this vector + * @return Sum of the values of the vector's scalar elements + */ + RAJA_INLINE + element_type sum() const + { + auto hsum = _mm256_hadd_pd(m_value, m_value); + return hsum[0] + m_value[2]; + } + + /*! + * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + return self_type(_mm256_mul_pd(m_value, x.m_value)).sum(); + } + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + // permute the first two lanes of the register + simd_type a = _mm256_shuffle_pd(m_value, m_value, 0x01); + + // take the maximum value of each lane + simd_type b = _mm256_max_pd(m_value, a); + + // now take the maximum of a lower and upper lane + return std::max(b[0], b[2]); + } + + /*! + * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmax(self_type a) const + { + return self_type(_mm256_max_pd(m_value, a.m_value)); + } + + /*!
+ * @brief Returns the smallest element + * @return The smallest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + // permute the first two lanes of the register + // m_value = ABCD + // a = BACC + simd_type a = _mm256_shuffle_pd(m_value, m_value, 0x01); + + // take the minimum value of each lane + simd_type b = _mm256_min_pd(m_value, a); + + // now take the minimum of a lower and upper lane + return std::min(b[0], b[2]); + } + + /*! + * @brief Returns element-wise smallest values + * @return Vector of the element-wise min values + */ + RAJA_INLINE + self_type vmin(self_type a) const + { + return self_type(_mm256_min_pd(m_value, a.m_value)); + } + }; + + + +} // namespace RAJA + + +#endif + +#endif //__AVX2__ diff --git a/include/RAJA/policy/simd/register/avx2_double4.hpp b/include/RAJA/policy/simd/register/avx2_double4.hpp new file mode 100644 index 0000000000..e8ad7a6fee --- /dev/null +++ b/include/RAJA/policy/simd/register/avx2_double4.hpp @@ -0,0 +1,360 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining a SIMD register abstraction. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifdef __AVX2__ + +#ifndef RAJA_policy_simd_register_avx2_double4_HPP +#define RAJA_policy_simd_register_avx2_double4_HPP + +#include "RAJA/config.hpp" +#include "RAJA/util/macros.hpp" + +// Include SIMD intrinsics header file +#include +#include + + +namespace RAJA +{ + + + template<> + class Register{ + public: + using self_type = Register; + using element_type = double; + + static constexpr size_t s_num_elem = 4; + static constexpr size_t s_byte_width = s_num_elem*sizeof(double); + static constexpr size_t s_bit_width = s_byte_width*8; + + using simd_type = __m256d; + + private: + simd_type m_value; + + public: + + /*! + * @brief Default constructor, zeros register contents + */ + RAJA_INLINE + Register() : m_value(_mm256_setzero_pd()) { + } + + /*! + * @brief Copy constructor from underlying simd register + */ + RAJA_INLINE + explicit Register(simd_type const &c) : m_value(c) {} + + + /*! + * @brief Copy constructor + */ + RAJA_INLINE + Register(self_type const &c) : m_value(c.m_value) {} + + + /*! + * @brief Construct from scalar. + * Sets all elements to same value (broadcast). + */ + RAJA_INLINE + Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {} + + /*! + * @brief Load constructor, assuming scalars are in consecutive memory + * locations. + */ + RAJA_INLINE + void load(element_type const *ptr){ + m_value = _mm256_loadu_pd(ptr); + } + + /*! + * @brief Strided load constructor, when scalars are located in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "gather" instructions if they are + * available. (like in avx2, but not in avx) + */ + RAJA_INLINE + void load(element_type const *ptr, size_t stride){ + m_value = _mm256_i64gather_pd(ptr, + _mm256_set_epi64x(3*stride, 2*stride, stride, 0), + sizeof(element_type)); + } + + + + /*! 
+ * @brief Store operation, assuming scalars are in consecutive memory + * locations. + */ + RAJA_INLINE + void store(element_type *ptr) const{ + _mm256_storeu_pd(ptr, m_value); + } + + /*! + * @brief Strided store operation, where scalars are stored in memory + * locations ptr, ptr+stride, ptr+2*stride, etc. + * + * + * Note: this could be done with "scatter" instructions if they are + * available. + */ + RAJA_INLINE + void store(element_type *ptr, size_t stride) const{ + for(size_t i = 0;i < s_num_elem;++ i){ + ptr[i*stride] = m_value[i]; + } + } + + /*! + * @brief Get scalar value from vector register + * @param i Offset of scalar to get + * @return Returns scalar value at i + */ + template + constexpr + RAJA_INLINE + element_type operator[](IDX i) const + {return m_value[i];} + + + /*! + * @brief Set scalar value in vector register + * @param i Offset of scalar to set + * @param value Value of scalar to set + */ + template + RAJA_INLINE + void set(IDX i, element_type value) + {m_value[i] = value;} + + /*! + * @brief Set entire vector to a single scalar value + * @param value Value to set all vector elements to + */ + RAJA_INLINE + self_type const &operator=(element_type value) + { + m_value = _mm256_set1_pd(value); + return *this; + } + + /*! + * @brief Assign one register to antoher + * @param x Vector to copy + * @return Value of (*this) + */ + RAJA_INLINE + self_type const &operator=(self_type const &x) + { + m_value = x.m_value; + return *this; + } + + + /*! + * @brief Add two vector registers + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type operator+(self_type const &x) const + { + return self_type(_mm256_add_pd(m_value, x.m_value)); + } + + /*! 
+ * @brief Add a vector to this vector + * @param x Vector to add to this register + * @return Value of (*this)+x + */ + RAJA_INLINE + self_type const &operator+=(self_type const &x) + { + m_value = _mm256_add_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Subtract two vector registers + * @param x Vector to subtract from this register + * @return Value of (*this)-x + */ + RAJA_INLINE + self_type operator-(self_type const &x) const + { + return self_type(_mm256_sub_pd(m_value, x.m_value)); + } + + /*! + * @brief Subtract a vector from this vector + * @param x Vector to subtract from this register + * @return Value of (*this)-x + */ + RAJA_INLINE + self_type const &operator-=(self_type const &x) + { + m_value = _mm256_sub_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Multiply two vector registers, element wise + * @param x Vector to multiply with this register + * @return Value of (*this)*x + */ + RAJA_INLINE + self_type operator*(self_type const &x) const + { + return self_type(_mm256_mul_pd(m_value, x.m_value)); + } + + /*! + * @brief Multiply a vector with this vector + * @param x Vector to multiply with this register + * @return Value of (*this)*x + */ + RAJA_INLINE + self_type const &operator*=(self_type const &x) + { + m_value = _mm256_mul_pd(m_value, x.m_value); + return *this; + } + + /*! + * @brief Divide two vector registers, element wise + * @param x Vector to divide this register by + * @return Value of (*this)/x + */ + RAJA_INLINE + self_type operator/(self_type const &x) const + { + return self_type(_mm256_div_pd(m_value, x.m_value)); + } + + /*! + * @brief Divide this vector by another vector + * @param x Vector to divide by + * @return Value of (*this)/x + */ + RAJA_INLINE + self_type const &operator/=(self_type const &x) + { + m_value = _mm256_div_pd(m_value, x.m_value); + return *this; + } + + /*!
+ * @brief Sum the elements of this vector + * @return Sum of the values of the vector's scalar elements + */ + RAJA_INLINE + element_type sum() const + { + auto hsum = _mm256_hadd_pd(m_value, m_value); + return hsum[0] + hsum[2]; + } + + /*! + * @brief Dot product of two vectors + * @param x Other vector to dot with this vector + * @return Value of (*this) dot x + */ + RAJA_INLINE + element_type dot(self_type const &x) const + { + return self_type(_mm256_mul_pd(m_value, x.m_value)).sum(); + } + + /*! + * @brief Returns the largest element + * @return The largest scalar element in the register + */ + RAJA_INLINE + element_type max() const + { + // permute the first two and last two lanes of the register + simd_type a = _mm256_shuffle_pd(m_value, m_value, 0x05); + + // take the maximum value of each lane + // this gives us b=XXYY where + // X = max(a[0], a[1]) + // Y = max(a[2], a[3]) + simd_type b = _mm256_max_pd(m_value, a); + + // now take the maximum of a lower and upper lane + return std::max(b[0], b[2]); + } + + /*! + * @brief Returns element-wise largest values + * @return Vector of the element-wise max values + */ + RAJA_INLINE + self_type vmax(self_type a) const + { + return self_type(_mm256_max_pd(m_value, a.m_value)); + } + + /*! + * @brief Returns the smallest element + * @return The smallest scalar element in the register + */ + RAJA_INLINE + element_type min() const + { + // permute the first two and last two lanes of the register + // m_value = ABCD + // a = BADC + simd_type a = _mm256_shuffle_pd(m_value, m_value, 0x05); + + // take the minimum value of each lane + // this gives us b=XXYY where + // X = min(a[0], a[1]) + // Y = min(a[2], a[3]) + simd_type b = _mm256_min_pd(m_value, a); + + // now take the minimum of a lower and upper lane + return std::min(b[0], b[2]); + } + + /*!
+ * @brief Returns element-wise smallest values + * @return Vector of the element-wise min values + */ + RAJA_INLINE + self_type vmin(self_type a) const + { + return self_type(_mm256_min_pd(m_value, a.m_value)); + } + }; + + + +} // namespace RAJA + + +#endif + +#endif //__AVX2__ diff --git a/include/RAJA/policy/simd/register/avx_double2.hpp b/include/RAJA/policy/simd/register/avx_double2.hpp index 5ada2cb5d7..99207076d9 100644 --- a/include/RAJA/policy/simd/register/avx_double2.hpp +++ b/include/RAJA/policy/simd/register/avx_double2.hpp @@ -3,7 +3,7 @@ * * \file * - * \brief RAJA header file defining vector operations. + * \brief RAJA header file defining a SIMD register abstraction. * ****************************************************************************** */ @@ -15,6 +15,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#ifdef __AVX__ + #ifndef RAJA_policy_simd_register_avx_double2_HPP #define RAJA_policy_simd_register_avx_double2_HPP @@ -334,3 +336,6 @@ namespace RAJA #endif + +#endif //__AVX__ + diff --git a/include/RAJA/policy/simd/register/avx_double3.hpp b/include/RAJA/policy/simd/register/avx_double3.hpp index cd439105e2..59d1b515cc 100644 --- a/include/RAJA/policy/simd/register/avx_double3.hpp +++ b/include/RAJA/policy/simd/register/avx_double3.hpp @@ -3,7 +3,7 @@ * * \file * - * \brief RAJA header file defining vector operations. + * \brief RAJA header file defining a SIMD register abstraction.
* ****************************************************************************** */ @@ -15,6 +15,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#ifdef __AVX__ + #ifndef RAJA_policy_simd_register_avx_double3_HPP #define RAJA_policy_simd_register_avx_double3_HPP @@ -354,3 +356,6 @@ namespace RAJA #endif + +#endif //__AVX__ + diff --git a/include/RAJA/policy/simd/register/avx_double4.hpp b/include/RAJA/policy/simd/register/avx_double4.hpp index 3c9108eaee..742d0177b3 100644 --- a/include/RAJA/policy/simd/register/avx_double4.hpp +++ b/include/RAJA/policy/simd/register/avx_double4.hpp @@ -3,7 +3,7 @@ * * \file * - * \brief RAJA header file defining vector operations. + * \brief RAJA header file defining a SIMD register abstraction. * ****************************************************************************** */ @@ -15,8 +15,10 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef RAJA_policy_simd_register_double4_HPP -#define RAJA_policy_simd_register_double4_HPP +#ifdef __AVX__ + +#ifndef RAJA_policy_simd_register_avx_double4_HPP +#define RAJA_policy_simd_register_avx_double4_HPP #include "RAJA/config.hpp" #include "RAJA/util/macros.hpp" @@ -355,3 +357,5 @@ namespace RAJA #endif + +#endif //__AVX__ diff --git a/test/unit/test-register.cpp b/test/unit/test-register.cpp index ec9eca7179..a2fbb993f1 100644 --- a/test/unit/test-register.cpp +++ b/test/unit/test-register.cpp @@ -12,29 +12,34 @@ #include "RAJA/RAJA.hpp" #include "gtest/gtest.h" -#if RAJA_SIMD_REGISTER_WIDTH == 0 + using RegisterTestTypes = ::testing::Types< - RAJA::Register, - RAJA::Register, - RAJA::Register>; -#else -using RegisterTestTypes = ::testing::Types< - RAJA::Register, - RAJA::Register, - RAJA::Register, - RAJA::Register, - RAJA::Register, - RAJA::Register, - RAJA::FixedVector, 27>, - RAJA::FixedVector, 27>, - RAJA::FixedVector, 27>, 
- RAJA::FixedVector, 27>, - RAJA::StreamVector, 4>, - RAJA::StreamVector, 8>, - RAJA::StreamVector, 12>, - RAJA::StreamVector, 16>>; +#ifdef __AVX__ + RAJA::Register, + RAJA::Register, + RAJA::Register, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::StreamVector, 4>, + RAJA::StreamVector, 8>, #endif +#ifdef __AVX2__ + RAJA::Register, + RAJA::Register, + RAJA::Register, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::StreamVector, 4>, + RAJA::StreamVector, 8>, +#endif + RAJA::Register, + RAJA::Register, + RAJA::Register>; template class RegisterTest : public ::testing::Test { diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index b36cc2cda3..944d35dbfa 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -13,17 +13,19 @@ #include "gtest/gtest.h" -#if RAJA_SIMD_REGISTER_WIDTH == 0 using VectorTestTypes = ::testing::Types< - RAJA::FixedVector, 4>>; -#else -using VectorTestTypes = ::testing::Types< - RAJA::FixedVector, 4>, - RAJA::FixedVector, 8>, - RAJA::StreamVector, 4>, - RAJA::StreamVector, 8>>; + +#ifdef __AVX__ + RAJA::FixedVector, 4>, + RAJA::FixedVector, 8>, + RAJA::StreamVector, 4>, + RAJA::StreamVector, 8>, #endif + RAJA::FixedVector, 3>, + RAJA::FixedVector, 5>, + RAJA::StreamVector, 1>, + RAJA::StreamVector, 3>>; template class VectorTest : public ::testing::Test From fd9a9f5eedd63883d46d75195ef615105b6c1fb9 Mon Sep 17 00:00:00 2001 From: "Adam J. 
Kunen" Date: Mon, 11 Nov 2019 09:24:41 -0800 Subject: [PATCH 015/593] Moved VectorIndex into pattern/vector --- include/RAJA/pattern/vector.hpp | 1 + include/RAJA/pattern/vector/VectorIndex.hpp | 63 +++++++++++++++++++++ include/RAJA/policy/simd/policy.hpp | 35 ------------ include/RAJA/util/View.hpp | 8 +++ 4 files changed, 72 insertions(+), 35 deletions(-) create mode 100644 include/RAJA/pattern/vector/VectorIndex.hpp diff --git a/include/RAJA/pattern/vector.hpp b/include/RAJA/pattern/vector.hpp index e2c9913d38..83e5fc6a78 100644 --- a/include/RAJA/pattern/vector.hpp +++ b/include/RAJA/pattern/vector.hpp @@ -19,6 +19,7 @@ #define RAJA_pattern_vector_HPP #include "RAJA/pattern/vector/Vector.hpp" +#include "RAJA/pattern/vector/VectorIndex.hpp" #include "RAJA/pattern/vector/VectorRef.hpp" diff --git a/include/RAJA/pattern/vector/VectorIndex.hpp b/include/RAJA/pattern/vector/VectorIndex.hpp new file mode 100644 index 0000000000..02b1676d5b --- /dev/null +++ b/include/RAJA/pattern/vector/VectorIndex.hpp @@ -0,0 +1,63 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining SIMD/SIMT register operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_pattern_vector_vectorindex_HPP +#define RAJA_pattern_vector_vectorindex_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/macros.hpp" + + +namespace RAJA +{ + + template + class VectorIndex { + public: + using index_type = IDX; + using vector_type = VECTOR_TYPE; + + RAJA_INLINE + constexpr + VectorIndex() : m_index(0), m_length(vector_type::s_num_elem) {} + + RAJA_INLINE + constexpr + VectorIndex(index_type value, size_t length) : m_index(value), m_length(length) {} + + RAJA_INLINE + constexpr + index_type operator*() const { + return m_index; + } + + RAJA_INLINE + constexpr + size_t size() const { + return m_length; + } + + private: + index_type m_index; + size_t m_length; + }; + +} // namespace RAJA + + +#endif diff --git a/include/RAJA/policy/simd/policy.hpp b/include/RAJA/policy/simd/policy.hpp index 924e8107ec..b1837adcc2 100644 --- a/include/RAJA/policy/simd/policy.hpp +++ b/include/RAJA/policy/simd/policy.hpp @@ -57,47 +57,12 @@ struct simd_vector_exec : make_policy_pattern_launch_platform_t -class VectorIndex { - public: - using index_type = IDX; - using vector_type = VECTOR_TYPE; - - RAJA_INLINE - constexpr - VectorIndex() : m_index(0), m_length(vector_type::s_num_elem) {} - - RAJA_INLINE - constexpr - VectorIndex(index_type value, size_t length) : m_index(value), m_length(length) {} - - RAJA_INLINE - constexpr - index_type operator*() const { - return m_index; - } - - RAJA_INLINE - constexpr - size_t size() const { - return m_length; - } - - private: - index_type m_index; - size_t m_length; -}; - - - - } // end of namespace simd } // end of namespace policy using policy::simd::simd_exec; using policy::simd::simd_vector_exec; -using policy::simd::VectorIndex; } // end of namespace RAJA diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp index 41329d8ac6..bf97aa2f57 100644 --- 
a/include/RAJA/util/View.hpp +++ b/include/RAJA/util/View.hpp @@ -30,6 +30,14 @@ namespace RAJA { +namespace internal +{ + + // Helper to unpack VectorIndex + +} // namespace internal + + //Helpers to convert //layouts -> OffsetLayouts //Typedlayouts -> TypedOffsetLayouts From 088aa590172ffd3aa89340d5d39e2469481a22c8 Mon Sep 17 00:00:00 2001 From: "Adam J. Kunen" Date: Mon, 11 Nov 2019 16:00:38 -0800 Subject: [PATCH 016/593] First pass at making arbitrary views work with VectorIndex --- include/RAJA/pattern/vector/VectorIndex.hpp | 6 +- include/RAJA/util/View.hpp | 158 ++++++++++++++++++-- 2 files changed, 150 insertions(+), 14 deletions(-) diff --git a/include/RAJA/pattern/vector/VectorIndex.hpp b/include/RAJA/pattern/vector/VectorIndex.hpp index 02b1676d5b..d8a59acff6 100644 --- a/include/RAJA/pattern/vector/VectorIndex.hpp +++ b/include/RAJA/pattern/vector/VectorIndex.hpp @@ -33,20 +33,24 @@ namespace RAJA using vector_type = VECTOR_TYPE; RAJA_INLINE + RAJA_HOST_DEVICE constexpr VectorIndex() : m_index(0), m_length(vector_type::s_num_elem) {} RAJA_INLINE + RAJA_HOST_DEVICE constexpr VectorIndex(index_type value, size_t length) : m_index(value), m_length(length) {} RAJA_INLINE + RAJA_HOST_DEVICE constexpr - index_type operator*() const { + index_type const &operator*() const { return m_index; } RAJA_INLINE + RAJA_HOST_DEVICE constexpr size_t size() const { return m_length; diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp index bf97aa2f57..32dee39861 100644 --- a/include/RAJA/util/View.hpp +++ b/include/RAJA/util/View.hpp @@ -23,6 +23,7 @@ #include "RAJA/config.hpp" #include "RAJA/pattern/atomic.hpp" +#include "RAJA/pattern/vector.hpp" #include "RAJA/util/Layout.hpp" #include "RAJA/util/OffsetLayout.hpp" @@ -33,7 +34,136 @@ namespace RAJA namespace internal { + // Helper that strips the Vector type from an argument + template + struct StripVectorIndex { + using arg_type = ARG; + using vector_type = RAJA::simd_scalar_register; + static constexpr 
bool s_is_vector = false; + + + RAJA_INLINE + RAJA_HOST_DEVICE + static + constexpr + arg_type const &get(arg_type const &arg){ + return arg; + } + }; + + template + struct StripVectorIndex> { + using arg_type = IDX; + using vector_type = VECTOR_TYPE; + static constexpr bool s_is_vector = true; + + RAJA_INLINE + RAJA_HOST_DEVICE + static + constexpr + arg_type const &get(VectorIndex const &arg){ + return *arg; + } + }; + + template + RAJA_INLINE + RAJA_HOST_DEVICE + constexpr + auto stripVectorIndex(ARG const &arg) -> + typename StripVectorIndex::arg_type const & + { + return StripVectorIndex::get(arg); + } + + template + struct ExtractVectorArg; + + template + struct ExtractVectorArg{ + using strip_index_t = StripVectorIndex; + using next_t = ExtractVectorArg; + + static constexpr camp::idx_t s_num_vector_args = + (strip_index_t::s_is_vector ? 1 : 0) + next_t::s_num_vector_args; + + static constexpr camp::idx_t s_vector_arg_idx = + (strip_index_t::s_is_vector ? I : next_t::s_vector_arg_idx); + + using vector_type = + typename std::conditional::type; + }; + + // Termination case + template + struct ExtractVectorArg{ + static constexpr camp::idx_t s_num_vector_args = 0; + static constexpr camp::idx_t s_vector_arg_idx = -1; + using vector_type = RAJA::simd_scalar_register; + }; + // Helper to unpack VectorIndex + template + struct ViewVectorArgsHelper; + + template + struct ViewVectorArgsHelper { + + // Count how many VectorIndex arguments there are + static constexpr size_t s_num_vector_args = ExtractType::s_num_vector_args; + + // Make sure we don't have conflicting arguments + static_assert(s_num_vector_args < 2, "View only supports a single VectorIndex at a time"); + + + // We cannot compute this yet. + // TODO: figure out how this might be computed... 
+ static constexpr bool s_is_stride_one = false; + + + // Compute a Vector type + using vector_type = typename ExtractType::vector_type; + + using type = VectorRef; + + template + RAJA_INLINE + RAJA_HOST_DEVICE + static + type createReturn(IdxLin lin_index, Args args, PointerType pointer, IdxLin stride){ + auto arg = camp::get(args); + return type(lin_index, arg.size(), pointer, stride); + } + }; + + template + struct ViewVectorArgsHelper { + + // We cannot compute this yet. + // TODO: figure out how this might be computed... + static constexpr bool s_is_stride_one = false; + + + using type = ValueType&; + + template + RAJA_INLINE + RAJA_HOST_DEVICE + static + type createReturn(IdxLin lin_index, Args , PointerType pointer, IdxLin ){ + return pointer[lin_index]; + } + }; + + + + template + using ViewVectorHelper = ViewVectorArgsHelper, ExtractVectorArg<0, ARGS...>::s_num_vector_args >= 1>; + + } // namespace internal @@ -115,27 +245,29 @@ struct View { // making this specifically typed would require unpacking the layout, // this is easier to maintain template - RAJA_HOST_DEVICE RAJA_INLINE value_type &operator()(Args... args) const + RAJA_HOST_DEVICE RAJA_INLINE + auto operator()(Args... 
args) const -> + typename internal::ViewVectorHelper::type { - auto idx = stripIndexType(layout(args...)); - return data[idx]; + using helper_t = internal::ViewVectorHelper; + + auto idx = stripIndexType(layout(internal::stripVectorIndex(args)...)); + return helper_t::createReturn(idx, camp::make_tuple(args...), data, 1); } - // making this specifically typed would require unpacking the layout, - // this is easier to maintain - //RAJA::StreamRegisterIndex - //RAJA::VectorRef, double*, true> - template + + template RAJA_HOST_DEVICE RAJA_INLINE - VectorRef - operator[](RAJA::VectorIndex arg) const + auto operator[](Arg arg) const -> + typename internal::ViewVectorHelper::type { + using helper_t = internal::ViewVectorHelper; + // Compute the linear index - linear_index_type idx = stripIndexType(layout(*arg)); + linear_index_type idx = stripIndexType(layout(internal::stripVectorIndex(arg))); // Stuff it back into the index - using ref_type = VectorRef; - return ref_type(idx, arg.size(), data, 1); + return helper_t::createReturn(idx, camp::make_tuple(arg), data, 1); } }; From 4ec1bdd2ad7ae54fed308917789d7b51d080dd7c Mon Sep 17 00:00:00 2001 From: "Adam J. Kunen" Date: Tue, 12 Nov 2019 12:44:02 -0800 Subject: [PATCH 017/593] View is now simd enabled, testing works with inner and outer loop simd vectorization --- include/RAJA/pattern/vector/VectorRef.hpp | 6 ++ test/unit/test-vector.cpp | 96 ++++++++++++++++++++++- 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/include/RAJA/pattern/vector/VectorRef.hpp b/include/RAJA/pattern/vector/VectorRef.hpp index 1b8c7cf221..f9d6972d9c 100644 --- a/include/RAJA/pattern/vector/VectorRef.hpp +++ b/include/RAJA/pattern/vector/VectorRef.hpp @@ -88,6 +88,12 @@ namespace RAJA {} + RAJA_INLINE + element_type *get_pointer() const + { + return &m_data[m_linear_index]; + } + /*! 
* @brief Set entire vector to a single scalar value * @param value Value to set all vector elements to diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index 944d35dbfa..66386c43e8 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -54,7 +54,7 @@ TYPED_TEST_P(VectorTest, ForallVectorRef1d) using element_t = typename vector_t::element_type; - size_t N = 8000; + size_t N = 100*vector_t::s_num_elem; // If we are not using fixed vectors, add some random number of elements // to the array to test some postamble code generation. if(!vector_t::s_is_fixed){ @@ -93,6 +93,98 @@ TYPED_TEST_P(VectorTest, ForallVectorRef1d) } -REGISTER_TYPED_TEST_CASE_P(VectorTest, ForallVectorRef1d); +TYPED_TEST_P(VectorTest, ForallVectorRef2d) +{ + using vector_t = TypeParam; + + using element_t = typename vector_t::element_type; + + + size_t N = 128, M = 16*vector_t::s_num_elem; + // If we are not using fixed vectors, add some random number of elements + // to the array to test some postamble code generation. 
+ if(!vector_t::s_is_fixed){ + N += (10*drand48()); + M += (10*drand48()); + } + + element_t *A = new element_t[N*M]; + element_t *B = new element_t[N*M]; + element_t *C = new element_t[N*M]; + for(size_t i = 0;i < N*M; ++ i){ + A[i] = (element_t)(drand48()*1000.0); + B[i] = (element_t)(drand48()*1000.0); + C[i] = 0.0; + } + + RAJA::View> X(A, N, M); + RAJA::View> Y(B, N, M); + RAJA::View> Z(C, N, M); + + using policy_t = RAJA::simd_vector_exec; + + ASSERT_EQ(A, X(0, RAJA::VectorIndex(0, 1)).get_pointer()); + ASSERT_EQ(A+M, X(1, RAJA::VectorIndex(0, 1)).get_pointer()); + ASSERT_EQ(A+1, X(0, RAJA::VectorIndex(1, 1)).get_pointer()); + + +// RAJA::KernelPolicy< +// RAJA::statement::For<0, RAJA::loop_exec, +// RAJA::statement::For<1, RAJA::simd_vector_exec, +// RAJA::statement::Lambda<0> +// > +// > +// >; + + +// RAJA::kernel( +// RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), +// RAJA::TypedRangeSegment(0, M)), +// +// [=](size_t i, RAJA::VectorIndex j) +// { +// Z(i,j) = 3+(X(i,j)*(5/Y(i,j)))+9; +// }); + + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](size_t i){ + + RAJA::forall(RAJA::TypedRangeSegment(0, M), + [=](RAJA::VectorIndex j){ + + Z(i,j) = 3+(X(i,j)*(5/Y(i,j)))+9; + }); + + }); + + for(size_t i = 0;i < N;i ++){ + ASSERT_DOUBLE_EQ(3+(A[i]*(5/B[i]))+9, C[i]); + } + + + + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](RAJA::VectorIndex i){ + + RAJA::forall(RAJA::TypedRangeSegment(0, M), + [=](size_t j){ + + Z(i,j) = 3+(X(i,j)*(5/Y(i,j)))+9; + }); + + }); + + for(size_t i = 0;i < N;i ++){ + ASSERT_DOUBLE_EQ(3+(A[i]*(5/B[i]))+9, C[i]); + } + + + delete[] A; + delete[] B; + delete[] C; +} + + +REGISTER_TYPED_TEST_CASE_P(VectorTest, ForallVectorRef1d, ForallVectorRef2d); INSTANTIATE_TYPED_TEST_CASE_P(SIMD, VectorTest, VectorTestTypes); From 7be3cd4d80d6926b601aadc193334a74ca64d273 Mon Sep 17 00:00:00 2001 From: "Adam J. 
Kunen" Date: Tue, 12 Nov 2019 12:48:23 -0800 Subject: [PATCH 018/593] Added AVX2 tests to test-vector --- test/unit/test-vector.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/unit/test-vector.cpp b/test/unit/test-vector.cpp index 66386c43e8..d0bdec4b6c 100644 --- a/test/unit/test-vector.cpp +++ b/test/unit/test-vector.cpp @@ -22,6 +22,13 @@ using VectorTestTypes = ::testing::Types< RAJA::StreamVector, 8>, #endif +#ifdef __AVX2__ + RAJA::FixedVector, 27>, + RAJA::FixedVector, 27>, + RAJA::StreamVector, 4>, + RAJA::StreamVector, 8>, +#endif + RAJA::FixedVector, 3>, RAJA::FixedVector, 5>, RAJA::StreamVector, 1>, @@ -146,6 +153,11 @@ TYPED_TEST_P(VectorTest, ForallVectorRef2d) // Z(i,j) = 3+(X(i,j)*(5/Y(i,j)))+9; // }); + + // + // Test inner loop SIMD + // + RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](size_t i){ @@ -162,6 +174,9 @@ TYPED_TEST_P(VectorTest, ForallVectorRef2d) } + // + // Test outer loop SIMD + // RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](RAJA::VectorIndex i){ From 8b3035ea24761051fda6c40987c19c08e84de530 Mon Sep 17 00:00:00 2001 From: Tom Scogland Date: Thu, 21 Nov 2019 07:44:41 -0800 Subject: [PATCH 019/593] require c++14 --- cmake/SetupCompilers.cmake | 7 ++++--- include/RAJA/config.hpp.in | 5 +++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cmake/SetupCompilers.cmake b/cmake/SetupCompilers.cmake index 45ddcc6eea..e9813e001d 100644 --- a/cmake/SetupCompilers.cmake +++ b/cmake/SetupCompilers.cmake @@ -10,14 +10,15 @@ set(COMPILERS_KNOWN_TO_CMAKE33 AppleClang Clang GNU MSVC) include(CheckCXXCompilerFlag) if(RAJA_CXX_STANDARD_FLAG MATCHES default) if("cxx_std_17" IN_LIST CMAKE_CXX_KNOWN_FEATURES) - #TODO set BLT_CXX_STANDARD + #TODO set BLT_CXX_STD + #NOTE @trws: did not do this as it does not behave correctly set(CMAKE_CXX_STANDARD 17) elseif("cxx_std_14" IN_LIST CMAKE_CXX_KNOWN_FEATURES) set(CMAKE_CXX_STANDARD 14) elseif("${CMAKE_CXX_COMPILER_ID}" IN_LIST COMPILERS_KNOWN_TO_CMAKE33) 
set(CMAKE_CXX_STANDARD 14) else() #cmake has no idea what to do, do it ourselves... - foreach(flag_var "-std=c++17" "-std=c++1z" "-std=c++14" "-std=c++1y" "-std=c++11") + foreach(flag_var "-std=c++17" "-std=c++1z" "-std=c++14" "-std=c++1y") CHECK_CXX_COMPILER_FLAG(${flag_var} COMPILER_SUPPORTS_${flag_var}) if(COMPILER_SUPPORTS_${flag_var}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${flag_var}") @@ -69,7 +70,7 @@ if ( MSVC ) endif() if (ENABLE_CUDA) - set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict -arch ${CUDA_ARCH} --expt-extended-lambda --expt-relaxed-constexpr") if (NOT RAJA_HOST_CONFIG_LOADED) diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 602758f2b7..a965fb4471 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -31,6 +31,11 @@ #ifndef RAJA_config_HPP #define RAJA_config_HPP +static_assert(__cplusplus >= 201402L, + "RAJA requires at least basic C++14 to operate correctly, your " + "compiler and/or standard library does not claim support for " + "C++14"); + /*! 
****************************************************************************** * From d213223772c04e04b4c2b7dae19faa4d7c01f31f Mon Sep 17 00:00:00 2001 From: Tom Scogland Date: Thu, 21 Nov 2019 10:20:16 -0800 Subject: [PATCH 020/593] fix cmake, add 17 folds, remove trailing returns --- cmake/SetupCompilers.cmake | 8 +- include/RAJA/internal/LegacyCompatibility.hpp | 187 +++--------------- include/RAJA/util/Layout.hpp | 41 ++-- include/RAJA/util/StaticLayout.hpp | 27 ++- 4 files changed, 69 insertions(+), 194 deletions(-) diff --git a/cmake/SetupCompilers.cmake b/cmake/SetupCompilers.cmake index e9813e001d..9593bccb94 100644 --- a/cmake/SetupCompilers.cmake +++ b/cmake/SetupCompilers.cmake @@ -9,14 +9,14 @@ set(COMPILERS_KNOWN_TO_CMAKE33 AppleClang Clang GNU MSVC) include(CheckCXXCompilerFlag) if(RAJA_CXX_STANDARD_FLAG MATCHES default) - if("cxx_std_17" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + if("cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES) #TODO set BLT_CXX_STD #NOTE @trws: did not do this as it does not behave correctly set(CMAKE_CXX_STANDARD 17) - elseif("cxx_std_14" IN_LIST CMAKE_CXX_KNOWN_FEATURES) - set(CMAKE_CXX_STANDARD 14) - elseif("${CMAKE_CXX_COMPILER_ID}" IN_LIST COMPILERS_KNOWN_TO_CMAKE33) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + elseif("cxx_std_14" IN_LIST CMAKE_CXX_COMPILE_FEATURES) set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_STANDARD_REQUIRED ON) else() #cmake has no idea what to do, do it ourselves... 
foreach(flag_var "-std=c++17" "-std=c++1z" "-std=c++14" "-std=c++1y") CHECK_CXX_COMPILER_FLAG(${flag_var} COMPILER_SUPPORTS_${flag_var}) diff --git a/include/RAJA/internal/LegacyCompatibility.hpp b/include/RAJA/internal/LegacyCompatibility.hpp index d08d675699..c2ea529605 100644 --- a/include/RAJA/internal/LegacyCompatibility.hpp +++ b/include/RAJA/internal/LegacyCompatibility.hpp @@ -20,6 +20,7 @@ #include "RAJA/config.hpp" +#include #include #include #include @@ -30,28 +31,18 @@ #include "RAJA/util/macros.hpp" +#if (!defined(CAMP_HAS_FOLD_EXPRESSIONS)) && \ + defined(__cpp_fold_expressions) && __cpp_fold_expressions >= 201603 +#define CAMP_HAS_FOLD_EXPRESSIONS 1 +#endif + + #if (!defined(__INTEL_COMPILER)) && (!defined(RAJA_COMPILER_MSVC)) static_assert(__cplusplus >= 201103L, "C++ standards below 2011 are not " "supported" RAJA_STRINGIFY_HELPER(__cplusplus)); #endif -#if __cplusplus > 201400L -#define RAJA_CXX14_CONSTEXPR constexpr -#else -#define RAJA_CXX14_CONSTEXPR -#endif - -// #if defined(RAJA_USE_CUDA) -// #include -// namespace VarOps { -// using thrust::tuple; -// using thrust::tuple_element; -// using thrust::get; -// using thrust::tuple_size; -// using thrust::make_tuple; -// } -// #else #include #include namespace VarOps @@ -73,36 +64,11 @@ namespace VarOps // Forward // FoldL -template -struct foldl_impl; - -template -struct foldl_impl { - using Ret = Arg1; -}; - -template -struct foldl_impl { - using Ret = typename std::result_of::type; -}; - -template -struct foldl_impl { - using Ret = typename foldl_impl< - Op, - typename std::result_of::type, - Arg3)>::type, - Rest...>::Ret; -}; template RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl( Op&& RAJA_UNUSED_ARG(operation), - Arg1&& arg) -> typename foldl_impl::Ret + Arg1&& arg) { return camp::forward(arg); } @@ -110,8 +76,7 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl( template RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, Arg1&& arg1, - Arg2&& arg2) -> - typename 
foldl_impl::Ret + Arg2&& arg2) { return camp::forward(operation)(camp::forward(arg1), camp::forward(arg2)); @@ -126,8 +91,7 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, Arg1&& arg1, Arg2&& arg2, Arg3&& arg3, - Rest&&... rest) -> - typename foldl_impl::Ret + Rest&&... rest) { return foldl(camp::forward(operation), camp::forward(operation)( @@ -142,42 +106,38 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, template RAJA_HOST_DEVICE RAJA_INLINE constexpr Result sum(Args... args) { +#ifdef CAMP_HAS_FOLD_EXPRESSIONS + return (... + args); +#else return foldl(RAJA::operators::plus(), args...); +#endif } template RAJA_HOST_DEVICE RAJA_INLINE constexpr Result max(Args... args) { - return foldl(RAJA::operators::maximum(), args...); + return std::max({args...}); } template RAJA_HOST_DEVICE RAJA_INLINE constexpr Result min(Args... args) { - return foldl(RAJA::operators::minimum(), args...); + return std::min({args...}); } -// template -// struct product_first_n; -// -// template -// struct product_first_n{ -// static Result value = 1; -// template -// constexpr product_first_n(Args...args) : value{1} { } -// }; -// -// template -// struct product_first_n{ -// static Result value = product_first_n(args...)::value; -// template -// constexpr product_first_n(FirstArg arg1, Args...args) -// : value() { } -// }; +template +RAJA_HOST_DEVICE RAJA_INLINE constexpr Result product(Args... args) +{ +#ifdef CAMP_HAS_FOLD_EXPRESSIONS + return (... * args); +#else + return foldl(RAJA::operators::multiplies(), args...); +#endif +} template