From 4c92a0e49b8ea55dbee76c516b7103e339725621 Mon Sep 17 00:00:00 2001 From: Juan Zuniga-Anaya <50754207+jzuniga-amd@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:08:05 -0700 Subject: [PATCH] Add LASR API (#845) * adds LASR definition and documentation * adds empty API * add unit test * implement lasr routine * optimize lasr routine * clean/simplify lasr code * address review comments * changelog * changelog update * address review comments --- CHANGELOG.md | 5 + clients/CMakeLists.txt | 1 + clients/benchmarks/client.cpp | 7 + clients/common/auxiliary/testing_lasr.cpp | 32 ++ clients/common/auxiliary/testing_lasr.hpp | 427 ++++++++++++++++ clients/common/misc/lapack_host_reference.cpp | 91 ++++ clients/common/misc/lapack_host_reference.hpp | 11 + clients/common/misc/rocsolver.hpp | 58 +++ clients/common/misc/rocsolver_arguments.hpp | 11 + clients/common/misc/rocsolver_dispatcher.hpp | 5 + clients/gtest/CMakeLists.txt | 2 + clients/gtest/auxiliary/lasr_gtest.cpp | 141 ++++++ common/include/rocsolver_datatype2string.hpp | 22 + docs/reference/auxiliary.rst | 25 + docs/reference/intro.rst | 5 + docs/reference/types.rst | 4 + .../include/rocsolver/rocsolver-extra-types.h | 19 +- .../include/rocsolver/rocsolver-functions.h | 123 +++++ library/src/CMakeLists.txt | 2 + library/src/auxiliary/rocauxiliary_lasr.cpp | 141 ++++++ library/src/auxiliary/rocauxiliary_lasr.hpp | 479 ++++++++++++++++++ library/src/include/lib_host_helpers.hpp | 5 +- library/src/include/rocsolver_logvalue.hpp | 10 + 23 files changed, 1619 insertions(+), 7 deletions(-) create mode 100644 clients/common/auxiliary/testing_lasr.cpp create mode 100644 clients/common/auxiliary/testing_lasr.hpp create mode 100644 clients/gtest/auxiliary/lasr_gtest.cpp create mode 100644 library/src/auxiliary/rocauxiliary_lasr.cpp create mode 100644 library/src/auxiliary/rocauxiliary_lasr.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 013478027..45187e998 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,12 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/index.html). ## (Unreleased) rocSOLVER + ### Added + +* Application of a sequence of plane rotations to a given matrix + - LASR + ### Changed ### Removed ### Optimized diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt index 21278c082..386330bf4 100755 --- a/clients/CMakeLists.txt +++ b/clients/CMakeLists.txt @@ -75,6 +75,7 @@ if(BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) common/auxiliary/testing_larf.cpp common/auxiliary/testing_larft.cpp common/auxiliary/testing_larfb.cpp + common/auxiliary/testing_lasr.cpp common/auxiliary/testing_latrd.cpp common/auxiliary/testing_labrd.cpp common/auxiliary/testing_lauum.cpp diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index d9e28a9c2..82669d341 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -565,6 +565,12 @@ try " The order in which a series of transformations are applied.\n" " ") + ("pivot", + value()->default_value('V'), + "V = variable, T = top, B = bottom.\n" + " Defines the planes on which a sequence of rotations is applied.\n" + " ") + ("evect", value()->default_value('N'), "N = none, V = compute eigenvectors of the matrix,\n" @@ -644,6 +650,7 @@ try argus.validate_fill("uplo"); argus.validate_diag("diag"); argus.validate_direct("direct"); + argus.validate_pivot("pivot"); argus.validate_storev("storev"); argus.validate_svect("svect"); argus.validate_svect("left_svect"); diff --git a/clients/common/auxiliary/testing_lasr.cpp b/clients/common/auxiliary/testing_lasr.cpp new file mode 100644 index 000000000..b00cc2d02 --- /dev/null +++ b/clients/common/auxiliary/testing_lasr.cpp @@ -0,0 +1,32 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "testing_lasr.hpp" + +#define TESTING_LASR(...) template void testing_lasr<__VA_ARGS__>(Arguments&); + +INSTANTIATE(TESTING_LASR, FOREACH_SCALAR_TYPE, APPLY_STAMP) diff --git a/clients/common/auxiliary/testing_lasr.hpp b/clients/common/auxiliary/testing_lasr.hpp new file mode 100644 index 000000000..b0b818979 --- /dev/null +++ b/clients/common/auxiliary/testing_lasr.hpp @@ -0,0 +1,427 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "common/misc/client_util.hpp" +#include "common/misc/clientcommon.hpp" +#include "common/misc/lapack_host_reference.hpp" +#include "common/misc/norm.hpp" +#include "common/misc/rocsolver.hpp" +#include "common/misc/rocsolver_arguments.hpp" +#include "common/misc/rocsolver_test.hpp" + +template +void lasr_checkBadArgs(const rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + S dC, + S dS, + T dA, + const rocblas_int lda) +{ + // handle + EXPECT_ROCBLAS_STATUS(rocsolver_lasr(nullptr, side, pivot, direct, m, n, dC, dS, dA, lda), + rocblas_status_invalid_handle); + + // values + EXPECT_ROCBLAS_STATUS( + rocsolver_lasr(handle, rocblas_side(0), pivot, direct, m, n, dC, dS, dA, lda), + rocblas_status_invalid_value); + EXPECT_ROCBLAS_STATUS( + rocsolver_lasr(handle, side, rocblas_pivot(0), direct, m, n, dC, dS, dA, lda), + rocblas_status_invalid_value); + EXPECT_ROCBLAS_STATUS( + rocsolver_lasr(handle, side, pivot, rocblas_direct(0), m, n, dC, dS, dA, lda), + rocblas_status_invalid_value); + + // pointers + EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, side, pivot, direct, m, n, (S) nullptr, dS, dA, lda), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, side, pivot, direct, m, n, dC, (S) nullptr, dA, lda), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, side, pivot, direct, m, n, dC, dS, (T) nullptr, lda), + rocblas_status_invalid_pointer); + + // quick return with invalid pointers + EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, rocblas_side_left, pivot, direct, 0, n, + (S) nullptr, (S) nullptr, (T) nullptr, lda), + rocblas_status_success); + EXPECT_ROCBLAS_STATUS( + rocsolver_lasr(handle, rocblas_side_right, pivot, direct, 0, n, dC, dS, (T) nullptr, lda), + rocblas_status_success); + EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, rocblas_side_right, pivot, direct, m, 0, + (S) nullptr, (S) nullptr, (T) nullptr, lda), + rocblas_status_success); + EXPECT_ROCBLAS_STATUS( + rocsolver_lasr(handle, rocblas_side_left, pivot, direct, m, 0, dC, dS, (T) nullptr, lda), + rocblas_status_success); + EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, rocblas_side_left, pivot, direct, 1, n, + (S) nullptr, (S) nullptr, dA, lda), + rocblas_status_success); + EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, rocblas_side_right, pivot, direct, m, 1, + (S) nullptr, (S) nullptr, dA, lda), + rocblas_status_success); +} + +template +void testing_lasr_bad_arg() +{ + using S = decltype(std::real(T{})); + + // safe arguments + rocblas_local_handle handle; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + rocblas_int m = 2; + rocblas_int n = 2; + rocblas_int lda = 2; + + // memory allocation + device_strided_batch_vector dC(1, 1, 1, 1); + device_strided_batch_vector dS(1, 1, 1, 1); + device_strided_batch_vector dA(1, 1, 1, 1); + CHECK_HIP_ERROR(dC.memcheck()); + CHECK_HIP_ERROR(dS.memcheck()); + CHECK_HIP_ERROR(dA.memcheck()); + + // check bad arguments + lasr_checkBadArgs(handle, side, pivot, direct, m, n, dC.data(), dS.data(), dA.data(), lda); +} + +template +void lasr_initData(const rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + Sd& dC, + Sd& dS, + Td& dA, + const rocblas_int lda, + Sh& hC, + Sh& hS, + Th& hA) +{ + if(CPU) + { + using S = decltype(std::real(T{})); + rocblas_init(hA, true); + + // construct C and S such that C^2 + S^2 = 1 + rocblas_init(hC, true); + rocblas_int size = (side == rocblas_side_left) ? m - 1 : n - 1; + + for(rocblas_int j = 0; j < size; ++j) + { + S temp = hC[0][j]; + temp = (temp - 5) / 5.0; + hC[0][j] = temp; + hS[0][j] = sqrt(1 - temp * temp); + } + } + + if(GPU) + { + // copy data from CPU to device + CHECK_HIP_ERROR(dC.transfer_from(hC)); + CHECK_HIP_ERROR(dS.transfer_from(hS)); + CHECK_HIP_ERROR(dA.transfer_from(hA)); + } +} + +template +void lasr_getError(const rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + Sd& dC, + Sd& dS, + Td& dA, + const rocblas_int lda, + Sh& hC, + Sh& hS, + Th& hA, + Th& hAr, + double* max_err) +{ + // initialize data + lasr_initData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA); + + // execute computations + // GPU lapack + CHECK_ROCBLAS_ERROR( + rocsolver_lasr(handle, side, pivot, direct, m, n, dC.data(), dS.data(), dA.data(), lda)); + CHECK_HIP_ERROR(hAr.transfer_from(dA)); + + // CPU lapack + cpu_lasr(side, pivot, direct, m, n, hC[0], hS[0], hA[0], lda); + + // error is ||hA - hAr|| / ||hA|| + // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. + // IT MIGHT BE REVISITED IN THE FUTURE) + // using frobenius + *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); +} + +template +void lasr_getPerfData(const rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + Sd& dC, + Sd& dS, + Td& dA, + const rocblas_int lda, + Sh& hC, + Sh& hS, + Th& hA, + double* gpu_time_used, + double* cpu_time_used, + const rocblas_int hot_calls, + const int profile, + const bool profile_kernels, + const bool perf) +{ + if(!perf) + { + lasr_initData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA); + + // cpu-lapack performance (only if not in perf mode) + *cpu_time_used = get_time_us_no_sync(); + cpu_lasr(side, pivot, direct, m, n, hC[0], hS[0], hA[0], lda); + *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; + } + + lasr_initData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA); + + // cold calls + for(int iter = 0; iter < 2; iter++) + { + lasr_initData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA); + + CHECK_ROCBLAS_ERROR(rocsolver_lasr(handle, side, pivot, direct, m, n, dC.data(), dS.data(), + dA.data(), lda)); + } + + // gpu-lapack performance + hipStream_t stream; + CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); + double start; + + if(profile > 0) + { + if(profile_kernels) + rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile + | rocblas_layer_mode_ex_log_kernel); + else + rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); + rocsolver_log_set_max_levels(profile); + } + + for(int iter = 0; iter < hot_calls; iter++) + { + lasr_initData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA); + + start = get_time_us_sync(stream); + rocsolver_lasr(handle, side, pivot, direct, m, n, dC.data(), dS.data(), dA.data(), lda); + *gpu_time_used += get_time_us_sync(stream) - start; + } + *gpu_time_used /= hot_calls; +} + +template +void testing_lasr(Arguments& argus) +{ + using S = decltype(std::real(T{})); + + // get arguments + rocblas_local_handle handle; + char sideC = argus.get("side"); + char pivotC = argus.get("pivot"); + char directC = argus.get("direct"); + rocblas_int m = argus.get("m"); + rocblas_int n = argus.get("n", m); + rocblas_int lda = argus.get("lda", m); + + rocblas_side side = char2rocblas_side(sideC); + rocblas_pivot pivot = char2rocblas_pivot(pivotC); + rocblas_direct direct = char2rocblas_direct(directC); + rocblas_int hot_calls = argus.iters; + + // check non-supported values + if(side != rocblas_side_left && side != rocblas_side_right) + { + EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, side, pivot, direct, m, n, (S*)nullptr, + (S*)nullptr, (T*)nullptr, lda), + rocblas_status_invalid_value); + + if(argus.timing) + rocsolver_bench_inform(inform_invalid_args); + + return; + } + + // check invalid sizes + bool invalid_size = (m < 0 || n < 0 || lda < m); + if(invalid_size) + { + EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, side, pivot, direct, m, n, (S*)nullptr, + (S*)nullptr, (T*)nullptr, lda), + rocblas_status_invalid_size); + + if(argus.timing) + rocsolver_bench_inform(inform_invalid_size); + + return; + } + + // determine sizes + bool left = (side == rocblas_side_left); + bool right = (side == rocblas_side_right); + size_t size_CS = 0; + if(left && m > 1) + size_CS = size_t(m - 1); + if(right && n > 1) + size_CS = size_t(n - 1); + size_t size_A = size_t(lda) * n; + size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; + + // memory size query is necessary + if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) + { + CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); + CHECK_ALLOC_QUERY(rocsolver_lasr(handle, side, pivot, direct, m, n, (S*)nullptr, + (S*)nullptr, (T*)nullptr, lda)); + + size_t size; + CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); + if(argus.mem_query) + { + rocsolver_bench_inform(inform_mem_query, size); + return; + } + + CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); + } + + // memory allocations + host_strided_batch_vector hA(size_A, 1, size_A, 1); + host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); + host_strided_batch_vector hC(size_CS, 1, size_CS, 1); + host_strided_batch_vector hS(size_CS, 1, size_CS, 1); + device_strided_batch_vector dA(size_A, 1, size_A, 1); + device_strided_batch_vector dC(size_CS, 1, size_CS, 1); + device_strided_batch_vector dS(size_CS, 1, size_CS, 1); + if(size_A) + CHECK_HIP_ERROR(dA.memcheck()); + if(size_CS) + { + CHECK_HIP_ERROR(dC.memcheck()); + CHECK_HIP_ERROR(dS.memcheck()); + } + + // check quick return + bool quickreturn = (left && m < 2) || (right && n < 2) || n == 0 || m == 0; + if(quickreturn) + { + EXPECT_ROCBLAS_STATUS( + rocsolver_lasr(handle, side, pivot, direct, m, n, dC.data(), dS.data(), dA.data(), lda), + rocblas_status_success); + + if(argus.timing) + rocsolver_bench_inform(inform_quick_return); + + return; + } + + double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; + + // check computations + if(argus.unit_check || argus.norm_check) + lasr_getError(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA, hAr, + &max_error); + + // collect performance data + if(argus.timing && hot_calls > 0) + lasr_getPerfData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA, + &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, + argus.profile_kernels, argus.perf); + + // validate results for rocsolver-test + // using s * machine_precision as tolerance + rocblas_int s = left ? m : n; + if(argus.unit_check) + ROCSOLVER_TEST_CHECK(T, max_error, s); + + // output results for rocsolver-bench + if(argus.timing) + { + if(!argus.perf) + { + rocsolver_bench_header("Arguments:"); + rocsolver_bench_output("side", "pivot", "direct", "m", "n", "lda"); + rocsolver_bench_output(sideC, pivotC, directC, m, n, lda); + + rocsolver_bench_header("Results:"); + if(argus.norm_check) + { + rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); + rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); + } + else + { + rocsolver_bench_output("cpu_time_us", "gpu_time_us"); + rocsolver_bench_output(cpu_time_used, gpu_time_used); + } + rocsolver_bench_endl(); + } + else + { + if(argus.norm_check) + rocsolver_bench_output(gpu_time_used, max_error); + else + rocsolver_bench_output(gpu_time_used); + } + } + + // ensure all arguments were consumed + argus.validate_consumed(); +} + +#define EXTERN_TESTING_LASR(...) extern template void testing_lasr<__VA_ARGS__>(Arguments&); + +INSTANTIATE(EXTERN_TESTING_LASR, FOREACH_SCALAR_TYPE, APPLY_STAMP) diff --git a/clients/common/misc/lapack_host_reference.cpp b/clients/common/misc/lapack_host_reference.cpp index 9fe576e98..ccb0041fb 100644 --- a/clients/common/misc/lapack_host_reference.cpp +++ b/clients/common/misc/lapack_host_reference.cpp @@ -764,6 +764,27 @@ void zlarfb_(char* side, rocblas_double_complex* W, int* ldw); +void slasr_(char* side, char* pivot, char* direct, int* m, int* n, float* C, float* S, float* A, int* lda); +void dlasr_(char* side, char* pivot, char* direct, int* m, int* n, double* C, double* S, double* A, int* lda); +void clasr_(char* side, + char* pivot, + char* direct, + int* m, + int* n, + float* C, + float* S, + rocblas_float_complex* A, + int* lda); +void zlasr_(char* side, + char* pivot, + char* direct, + int* m, + int* n, + double* C, + double* S, + rocblas_double_complex* A, + int* lda); + void slatrd_(char* uplo, int* n, int* k, float* A, int* lda, float* E, float* tau, float* W, int* ldw); void dlatrd_(char* uplo, int* n, int* k, double* A, int* lda, double* E, double* tau, double* W, int* ldw); void clatrd_(char* uplo, @@ -3082,6 +3103,76 @@ void cpu_larfb(rocblas_side sideR, zlarfb_(&side, &trans, &direct, &storev, &m, &n, &k, V, &ldv, T, &ldt, A, &lda, W, &ldw); } +// lasr + +template <> +void cpu_lasr(rocblas_side sideR, + rocblas_pivot pivotR, + rocblas_direct directR, + rocblas_int m, + rocblas_int n, + float* C, + float* S, + float* A, + rocblas_int lda) +{ + char side = rocblas2char_side(sideR); + char pivot = rocblas2char_pivot(pivotR); + char direct = rocblas2char_direct(directR); + slasr_(&side, &pivot, &direct, &m, &n, C, S, A, &lda); +} + +template <> +void cpu_lasr(rocblas_side sideR, + rocblas_pivot pivotR, + rocblas_direct directR, + rocblas_int m, + rocblas_int n, + double* C, + double* S, + double* A, + rocblas_int lda) +{ + char side = rocblas2char_side(sideR); + char pivot = rocblas2char_pivot(pivotR); + char direct = rocblas2char_direct(directR); + dlasr_(&side, &pivot, &direct, &m, &n, C, S, A, &lda); +} + +template <> +void cpu_lasr(rocblas_side sideR, + rocblas_pivot pivotR, + rocblas_direct directR, + rocblas_int m, + rocblas_int n, + float* C, + float* S, + rocblas_float_complex* A, + rocblas_int lda) +{ + char side = rocblas2char_side(sideR); + char pivot = rocblas2char_pivot(pivotR); + char direct = rocblas2char_direct(directR); + clasr_(&side, &pivot, &direct, &m, &n, C, S, A, &lda); +} + +template <> +void cpu_lasr(rocblas_side sideR, + rocblas_pivot pivotR, + rocblas_direct directR, + rocblas_int m, + rocblas_int n, + double* C, + double* S, + rocblas_double_complex* A, + rocblas_int lda) +{ + char side = rocblas2char_side(sideR); + char pivot = rocblas2char_pivot(pivotR); + char direct = rocblas2char_direct(directR); + zlasr_(&side, &pivot, &direct, &m, &n, C, S, A, &lda); +} + // lauum template <> void cpu_lauum(rocblas_fill uploR, rocblas_int n, float* A, rocblas_int lda) diff --git a/clients/common/misc/lapack_host_reference.hpp b/clients/common/misc/lapack_host_reference.hpp index bc2314ba6..4f3104795 100644 --- a/clients/common/misc/lapack_host_reference.hpp +++ b/clients/common/misc/lapack_host_reference.hpp @@ -297,6 +297,17 @@ void cpu_larfb(rocblas_side side, T* W, rocblas_int ldw); +template +void cpu_lasr(rocblas_side side, + rocblas_pivot pivot, + rocblas_direct direct, + rocblas_int m, + rocblas_int n, + SS* C, + SS* S, + T* A, + rocblas_int lda); + template void cpu_latrd(rocblas_fill uplo, rocblas_int n, diff --git a/clients/common/misc/rocsolver.hpp b/clients/common/misc/rocsolver.hpp index 11a3b6c6a..0fa8a8ea0 100644 --- a/clients/common/misc/rocsolver.hpp +++ b/clients/common/misc/rocsolver.hpp @@ -1291,6 +1291,64 @@ inline rocblas_status rocsolver_larfb(rocblas_handle handle, } /***************************************************************/ +/******************** LASR *************************************/ +inline rocblas_status rocsolver_lasr(rocblas_handle handle, + rocblas_side side, + rocblas_pivot pivot, + rocblas_direct direct, + rocblas_int m, + rocblas_int n, + float* C, + float* S, + float* A, + rocblas_int lda) +{ + return rocsolver_slasr(handle, side, pivot, direct, m, n, C, S, A, lda); +} + +inline rocblas_status rocsolver_lasr(rocblas_handle handle, + rocblas_side side, + rocblas_pivot pivot, + rocblas_direct direct, + rocblas_int m, + rocblas_int n, + double* C, + double* S, + double* A, + rocblas_int lda) +{ + return rocsolver_dlasr(handle, side, pivot, direct, m, n, C, S, A, lda); +} + +inline rocblas_status rocsolver_lasr(rocblas_handle handle, + rocblas_side side, + rocblas_pivot pivot, + rocblas_direct direct, + rocblas_int m, + rocblas_int n, + float* C, + float* S, + rocblas_float_complex* A, + rocblas_int lda) +{ + return rocsolver_clasr(handle, side, pivot, direct, m, n, C, S, A, lda); +} + +inline rocblas_status rocsolver_lasr(rocblas_handle handle, + rocblas_side side, + rocblas_pivot pivot, + rocblas_direct direct, + rocblas_int m, + rocblas_int n, + double* C, + double* S, + rocblas_double_complex* A, + rocblas_int lda) +{ + return rocsolver_zlasr(handle, side, pivot, direct, m, n, C, S, A, lda); +} +/***************************************************************/ + /******************** LAUUM ********************/ inline rocblas_status rocsolver_lauum(rocblas_handle handle, diff --git a/clients/common/misc/rocsolver_arguments.hpp b/clients/common/misc/rocsolver_arguments.hpp index 34006f51a..cfcb887eb 100644 --- a/clients/common/misc/rocsolver_arguments.hpp +++ b/clients/common/misc/rocsolver_arguments.hpp @@ -193,6 +193,17 @@ class Arguments : private std::map throw std::invalid_argument("Invalid value for " + name); } + void validate_pivot(const std::string name) const + { + auto val = find(name); + if(val == end()) + return; + + char pivot = val->second.as(); + if(pivot != 'V' && pivot != 'T' && pivot != 'B') + throw std::invalid_argument("Invalid value for " + name); + } + void validate_storev(const std::string name) const { auto val = find(name); diff --git a/clients/common/misc/rocsolver_dispatcher.hpp b/clients/common/misc/rocsolver_dispatcher.hpp index b875d31cf..6e2947303 100644 --- a/clients/common/misc/rocsolver_dispatcher.hpp +++ b/clients/common/misc/rocsolver_dispatcher.hpp @@ -41,6 +41,7 @@ #include "common/auxiliary/testing_larfb.hpp" #include "common/auxiliary/testing_larfg.hpp" #include "common/auxiliary/testing_larft.hpp" +#include "common/auxiliary/testing_lasr.hpp" #include "common/auxiliary/testing_laswp.hpp" #include "common/auxiliary/testing_lasyf.hpp" #include "common/auxiliary/testing_latrd.hpp" @@ -131,6 +132,7 @@ class rocsolver_dispatcher { // Map for functions that support all precisions static const func_map map = { + // auxiliaries {"laswp", testing_laswp}, {"larfg", testing_larfg}, {"larfg_64", testing_larfg}, @@ -138,6 +140,7 @@ class rocsolver_dispatcher {"larf_64", testing_larf}, {"larft", testing_larft}, {"larfb", testing_larfb}, + {"lasr", testing_lasr}, {"latrd", testing_latrd}, {"labrd", testing_labrd}, {"bdsqr", testing_bdsqr}, @@ -319,6 +322,7 @@ class rocsolver_dispatcher { // Map for functions that support only single and double precisions static const func_map map_real = { + // auxiliaries {"sterf", testing_sterf}, {"stebz", testing_stebz}, {"bdsvdx", testing_bdsvdx}, @@ -427,6 +431,7 @@ class rocsolver_dispatcher { // Map for functions that support only single-complex and double-complex precisions static const func_map map_complex = { + // auxiliaries {"lacgv", testing_lacgv}, {"lacgv_64", testing_lacgv}, // ungxx diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index d628beef7..8c8c4ca5c 100755 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -82,6 +82,8 @@ set(rocauxiliary_test_source auxiliary/larfg_gtest.cpp auxiliary/larft_gtest.cpp auxiliary/larfb_gtest.cpp + # plane rotations + auxiliary/lasr_gtest.cpp # orthonormal/unitary matrices auxiliary/orgxr_ungxr_gtest.cpp auxiliary/orglx_unglx_gtest.cpp diff --git a/clients/gtest/auxiliary/lasr_gtest.cpp b/clients/gtest/auxiliary/lasr_gtest.cpp new file mode 100644 index 000000000..56cbbb72b --- /dev/null +++ b/clients/gtest/auxiliary/lasr_gtest.cpp @@ -0,0 +1,141 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "common/auxiliary/testing_lasr.hpp" + +using ::testing::Combine; +using ::testing::TestWithParam; +using ::testing::Values; +using ::testing::ValuesIn; +using namespace std; + +typedef std::tuple, vector> lasr_tuple; + +// each matrix_size_range vector is a {M,N,lda} + +// each ops_case vector is a {side, pivot, direct} + +// case when m = 0, n = 0, side = 'L', pivot = 'V' and direct = 'F' will also execute the +// bad arguments test (null handle, null pointers and invalid values) + +// for checkin_lapack tests +const vector> matrix_size_range = { + // quick return + {0, 2, 2}, + {2, 0, 2}, + {1, 1, 2}, + // invalid + {-1, 1, 1}, + {1, -1, 1}, + {15, 15, 5}, + // normal (valid) samples + {15, 15, 15}, + {18, 18, 30}, + {25, 18, 25}, + {30, 25, 40}, + {30, 35, 30}, + {40, 50, 50}}; + +const vector> ops_case + = {{'L', 'V', 'F'}, {'L', 'V', 'B'}, {'L', 'T', 'F'}, {'L', 'T', 'B'}, + {'L', 'B', 'F'}, {'L', 'B', 'B'}, {'R', 'V', 'F'}, {'R', 'V', 'B'}, + {'R', 'T', 'F'}, {'R', 'T', 'B'}, {'R', 'B', 'F'}, {'R', 'B', 'B'}}; + +// for daily_lapack tests +const vector> large_matrix_size_range + = {{180, 250, 200}, {500, 350, 500}, {700, 700, 800}, {1024, 1000, 1024}}; + +Arguments lasr_setup_arguments(lasr_tuple tup) +{ + vector size = std::get<0>(tup); + vector ops = std::get<1>(tup); + + Arguments arg; + + arg.set("m", size[0]); + arg.set("n", size[1]); + arg.set("lda", size[2]); + arg.set("side", ops[0]); + arg.set("pivot", ops[1]); + arg.set("direct", ops[2]); + + arg.timing = 0; + + return arg; +} + +class LASR : public ::TestWithParam +{ +protected: + void TearDown() override + { + EXPECT_EQ(hipGetLastError(), hipSuccess); + } + + template + void run_tests() + { + Arguments arg = lasr_setup_arguments(GetParam()); + + if(arg.peek("m") == 0 && arg.peek("n") == 0 + && arg.peek("side") == 'L' && arg.peek("pivot") == 'V' + && arg.peek("direct") == 'F') + testing_lasr_bad_arg(); + + testing_lasr(arg); + } +}; + +// non-batch tests + +TEST_P(LASR, __float) +{ + run_tests(); +} + +TEST_P(LASR, __double) +{ + run_tests(); +} + +TEST_P(LASR, __float_complex) +{ + run_tests(); +} + +TEST_P(LASR, __double_complex) +{ + run_tests(); +} + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + LASR, + Combine(ValuesIn(large_matrix_size_range), ValuesIn(ops_case))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + LASR, + Combine(ValuesIn(matrix_size_range), ValuesIn(ops_case))); diff --git a/common/include/rocsolver_datatype2string.hpp b/common/include/rocsolver_datatype2string.hpp index 501c1f6b9..f83e16cf2 100644 --- a/common/include/rocsolver_datatype2string.hpp +++ b/common/include/rocsolver_datatype2string.hpp @@ -107,6 +107,17 @@ constexpr auto rocblas2char_direct(rocblas_direct value) return '\0'; } +constexpr auto rocblas2char_pivot(rocblas_pivot value) +{ + switch(value) + { + case rocblas_pivot_variable: return 'V'; + case rocblas_pivot_top: return 'T'; + case rocblas_pivot_bottom: return 'B'; + } + return '\0'; +} + constexpr auto rocblas2char_storev(rocblas_storev value) { switch(value) @@ -306,6 +317,17 @@ constexpr rocblas_direct char2rocblas_direct(char value) } } +constexpr rocblas_pivot char2rocblas_pivot(char value) +{ + switch(value) + { + case 'V': return rocblas_pivot_variable; + case 'T': return rocblas_pivot_top; + case 'B': return rocblas_pivot_bottom; + default: return static_cast(0); + } +} + constexpr rocblas_storev char2rocblas_storev(char value) { switch(value) diff --git a/docs/reference/auxiliary.rst b/docs/reference/auxiliary.rst index 18f05a86c..83503e7e2 100644 --- a/docs/reference/auxiliary.rst +++ b/docs/reference/auxiliary.rst @@ -13,6 +13,7 @@ The auxiliary functions are divided into the following categories: * :ref:`vecmat`. Some basic operations with vectors and matrices that are not part of the BLAS standard. * :ref:`householder`. Generation and application of Householder matrices. +* :ref:`rotations`. Generation and application of Givens (plane) rotations. * :ref:`bidiag`. Computations specialized in bidiagonal matrices. * :ref:`tridiag`. Computations specialized in tridiagonal matrices. * :ref:`symmetric`. Computations specialized in symmetric matrices. @@ -163,6 +164,30 @@ rocsolver_larfb() +.. _rotations: + +Givens (plane) rotations +================================== + +.. contents:: List of plane rotations functions + :local: + :backlinks: top + +.. _lasr: + +rocsolver_lasr() +--------------------------------------- +.. doxygenfunction:: rocsolver_zlasr + :outline: +.. doxygenfunction:: rocsolver_clasr + :outline: +.. doxygenfunction:: rocsolver_dlasr + :outline: +.. doxygenfunction:: rocsolver_slasr + + + + .. _bidiag: Bidiagonal forms diff --git a/docs/reference/intro.rst b/docs/reference/intro.rst index 9a62f16d8..a170cb1cb 100644 --- a/docs/reference/intro.rst +++ b/docs/reference/intro.rst @@ -37,6 +37,11 @@ LAPACK auxiliary functions :ref:`rocsolver_larft `, x, x, x, x :ref:`rocsolver_larfb `, x, x, x, x +.. csv-table:: Givens (plane) rotations + :header: "Function", "single", "double", "single complex", "double complex" + + :ref:`rocsolver_lasr `, x, x, x, x + .. csv-table:: Bidiagonal forms :header: "Function", "single", "double", "single complex", "double complex" diff --git a/docs/reference/types.rst b/docs/reference/types.rst index 8191098ba..6706f0e23 100644 --- a/docs/reference/types.rst +++ b/docs/reference/types.rst @@ -19,6 +19,10 @@ rocblas_direct --------------- .. doxygenenum:: rocblas_direct +rocblas_pivot +--------------- +.. doxygenenum:: rocblas_pivot + rocblas_storev --------------- .. doxygenenum:: rocblas_storev diff --git a/library/include/rocsolver/rocsolver-extra-types.h b/library/include/rocsolver/rocsolver-extra-types.h index 89ea80692..a22d62ce9 100644 --- a/library/include/rocsolver/rocsolver-extra-types.h +++ b/library/include/rocsolver/rocsolver-extra-types.h @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -42,13 +42,13 @@ typedef enum rocblas_layer_mode_ex_ rocblas_layer_mode_ex_log_kernel = 0x10, /**< Enable logging for kernel calls. */ } rocblas_layer_mode_ex; -/*! \brief Used to specify the order in which multiple Householder matrices are - *applied together +/*! \brief Used to specify the order in which multiple Householder matrices or plane rotations are + *applied together (see the documentation of the specific routines for more details). ********************************************************************************/ typedef enum rocblas_direct_ { - rocblas_forward_direction = 171, /**< Householder matrices applied from the right. */ - rocblas_backward_direction = 172, /**< Householder matrices applied from the left. */ + rocblas_forward_direction = 171, /**< Forward ordering. */ + rocblas_backward_direction = 172, /**< Backward ordering. */ } rocblas_direct; /*! \brief Used to specify how householder vectors are stored in a matrix of @@ -164,4 +164,13 @@ typedef enum rocsolver_rfinfo_mode_ = 272, /**< To work with Cholesky factorization (for symmetric positive definite sparse matrices). */ } rocsolver_rfinfo_mode; +/*! \brief Used to specify the planes on which a sequence of Givens rotations is applied. + ********************************************************************************/ +typedef enum rocblas_pivot_ +{ + rocblas_pivot_variable = 281, /**< The i-th rotation is applied on plane (i,i+1). */ + rocblas_pivot_top = 282, /**< The i-th rotation is applied on plane (1,i+1). */ + rocblas_pivot_bottom = 283, /**< The i-th rotation is applied on plane (i,m) or (i,n). */ +} rocblas_pivot; + #endif /* ROCSOLVER_EXTRA_TYPES_H */ diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index 250d07b2a..1964166d8 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -764,6 +764,129 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zlarfb(rocblas_handle handle, const rocblas_int lda); //! @} +/*! @{ + \brief LASR applies a sequence of Givens plane rotations, represented as a transformation P, + to a general m-by-n matrix A. + + \details + The transformation P is applied in one of the following forms, depending on + the value of side: + + \f[ + \begin{array}{cl} + PA & \: \text{(No transpose from the left),}\\ + AP^T & \: \text{(Transpose from the right).} + \end{array} + \f] + + P is defined as the product of k plane rotations, with k = m - 1 when applied from the left, and + k = n - 1 when applied from the right, as follows: + + \f[ + \begin{array}{cl} + P = P(1)P(2)\cdots P(k) & \: \text{if direct indicates backward direction, or} \\ + P = P(k)\cdots P(2)P(1) & \: \text{if direct indicates forward direction} + \end{array} + \f] + + Each P(i) is defined by a Givens rotation + + \f[ + R(i) = \left[\begin{array}{cc} + c_i & s_i \\ + -s_i & c_i + \end{array}\right], + \f] + + where the \f$c_i\f$ and \f$s_i\f$ are the corresponding cosine and sine factors. + + The rotations are performed on different planes depending on the value of pivot. If pivot is + variable, the rotation R(i) is performed on plane (i,i+1), i.e. P(i) appears as a rank-2 + modification to the identity matrix in the i-th and (i+1)-th rows and columns. If pivot is + top, then the modification appears in the first and (i+1)-th rows and columns of P(i), + and if pivot is bottom, then the modification appears in the i-th and last rows and columns of P(i). + + All rotations are applied directly without ever forming P(i) explicitly. + + @param[in] + handle rocblas_handle. + @param[in] + side rocblas_side. + Specifies from which side to apply P. + @param[in] + pivot #rocblas_pivot. + Specifies the planes on which the rotations are applied. + @param[in] + direct #rocblas_direct. + Specifies the direction in which the plane rotations are to be applied to generate P. + @param[in] + m rocblas_int. m >= 0. + Number of rows of matrix A. + @param[in] + n rocblas_int. n >= 0. + Number of columns of matrix A. + @param[in] + C pointer to real type. Array on the GPU of size n-1 if side is right, or m-1 + if side is left. + Contains the series of cosine factors defining the Givens rotations. + @param[in] + S pointer to real type. Array on the GPU of size n-1 if side is right, or m-1 + if side is left. + Contains the series of sine factors defining the Givens rotations. + @param[inout] + A pointer to type. Array on the GPU of size lda*n. + On entry, the matrix A. On exit, it is overwritten with + P*A, or A*P^T. + @param[in] + lda rocblas_int. lda >= m. + Leading dimension of A. + ****************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_slasr(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + float* C, + float* S, + float* A, + const rocblas_int lda); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dlasr(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + double* C, + double* S, + double* A, + const rocblas_int lda); + +ROCSOLVER_EXPORT rocblas_status rocsolver_clasr(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + float* C, + float* S, + rocblas_float_complex* A, + const rocblas_int lda); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zlasr(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + double* C, + double* S, + rocblas_double_complex* A, + const rocblas_int lda); +//! @} + /*! @{ \brief LABRD computes the bidiagonal form of the first k rows and columns of a general m-by-n matrix A, as well as the matrices X and Y needed to reduce diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 2631d36f2..5a6203457 100755 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -231,6 +231,8 @@ set(rocsolver_auxiliary_source auxiliary/rocauxiliary_larf.cpp auxiliary/rocauxiliary_larft.cpp auxiliary/rocauxiliary_larfb.cpp + # plane rotations + auxiliary/rocauxiliary_lasr.cpp # orthonormal/unitary matrices auxiliary/rocauxiliary_org2r_ung2r.cpp auxiliary/rocauxiliary_orgqr_ungqr.cpp diff --git a/library/src/auxiliary/rocauxiliary_lasr.cpp b/library/src/auxiliary/rocauxiliary_lasr.cpp new file mode 100644 index 000000000..3b9a3907e --- /dev/null +++ b/library/src/auxiliary/rocauxiliary_lasr.cpp @@ -0,0 +1,141 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "rocauxiliary_lasr.hpp" + +ROCSOLVER_BEGIN_NAMESPACE + +template +rocblas_status rocsolver_lasr_impl(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + SS* C, + SS* S, + T* A, + const rocblas_int lda) +{ + ROCSOLVER_ENTER_TOP("lasr", "--side", side, "--pivot", pivot, "--direct", direct, "-m", m, "-n", + n, "--lda", lda); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st = rocsolver_lasr_argCheck(handle, side, pivot, direct, m, n, C, S, A, lda); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + + // normal (non-batched non-strided) execution + rocblas_stride strideC = 0; + rocblas_stride strideS = 0; + rocblas_stride strideA = 0; + rocblas_int batch_count = 1; + + // this function does not require memory work space + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_status_size_unchanged; + + // execution + return rocsolver_lasr_template(handle, side, pivot, direct, m, n, C, strideC, S, strideS, A, + shiftA, lda, strideA, batch_count); +} + +ROCSOLVER_END_NAMESPACE + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_slasr(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + float* C, + float* S, + float* A, + const rocblas_int lda) +{ + return rocsolver::rocsolver_lasr_impl(handle, side, pivot, direct, m, n, C, S, A, lda); +} + +rocblas_status rocsolver_dlasr(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + double* C, + double* S, + double* A, + const rocblas_int lda) +{ + return rocsolver::rocsolver_lasr_impl(handle, side, pivot, direct, m, n, C, S, A, lda); +} + +rocblas_status rocsolver_clasr(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + float* C, + float* S, + rocblas_float_complex* A, + const rocblas_int lda) +{ + return rocsolver::rocsolver_lasr_impl(handle, side, pivot, direct, m, n, + C, S, A, lda); +} + +rocblas_status rocsolver_zlasr(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + double* C, + double* S, + rocblas_double_complex* A, + const rocblas_int lda) +{ + return rocsolver::rocsolver_lasr_impl(handle, side, pivot, direct, m, n, + C, S, A, lda); +} + +} // extern C diff --git a/library/src/auxiliary/rocauxiliary_lasr.hpp b/library/src/auxiliary/rocauxiliary_lasr.hpp new file mode 100644 index 000000000..cf940ce36 --- /dev/null +++ b/library/src/auxiliary/rocauxiliary_lasr.hpp @@ -0,0 +1,479 @@ +/************************************************************************ + * Derived from the BSD3-licensed + * LAPACK routine (version 3.7.0) -- + * Univ. of Tennessee, Univ. of California Berkeley, + * Univ. of Colorado Denver and NAG Ltd.. + * June 2013 + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "rocblas.hpp" +#include "rocsolver/rocsolver.h" + +ROCSOLVER_BEGIN_NAMESPACE + +#ifndef LASR_MAX_NTHREADS +#define LASR_MAX_NTHREADS 64 +#endif + +/***************** GPU Device functions *****************************************/ +/********************************************************************************/ + +template +__host__ __device__ static void lasr_body(const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + I const m, + I const n, + S* c, + S* s, + T* A, + I const lda, + I const tid, + I const t_inc) +{ + // --------------------- + // determine path case + // --------------------- + const bool is_side_Left = (side == rocblas_side_left); + const bool is_side_Right = (side == rocblas_side_right); + const bool is_pivot_Variable = (pivot == rocblas_pivot_variable); + const bool is_pivot_Bottom = (pivot == rocblas_pivot_bottom); + const bool is_pivot_Top = (pivot == rocblas_pivot_top); + const bool is_direct_Forward = (direct == rocblas_forward_direction); + const bool is_direct_Backward = (direct == rocblas_backward_direction); + + // ------------ + // path cases: + // ------------ + + // ----------------------------- + // A := P*A + // Variable pivot, the plane (k,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + if(is_side_Left && is_pivot_Variable && is_direct_Forward) + { + for(I j = tid; j < n; j += t_inc) + { + auto temp = A[idx2D(0, j, lda)]; + for(I i = 0; i <= (m - 2); i++) + { + const auto ctemp = c[i]; + const auto stemp = s[i]; + const auto temp_hold = A[idx2D(i + 1, j, lda)]; + A[idx2D(i, j, lda)] = stemp * temp_hold + ctemp * temp; + temp = ctemp * temp_hold - stemp * temp; + } + A[idx2D(m - 1, j, lda)] = temp; + } + return; + } + + // ----------------------------- + // A := P*A + // Variable pivot, the plane (k,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + if(is_side_Left && is_pivot_Variable && is_direct_Backward) + { + for(I j = tid; j < n; j += t_inc) + { + auto temp = A[idx2D(m - 1, j, lda)]; + for(I i = (m - 2); i >= 0; i--) + { + const auto ctemp = c[i]; + const auto stemp = s[i]; + const auto temp_hold = A[idx2D(i, j, lda)]; + A[idx2D(i + 1, j, lda)] = ctemp * temp - stemp * temp_hold; + temp = stemp * temp + ctemp * temp_hold; + } + A[idx2D(0, j, lda)] = temp; + } + return; + } + + // ----------------------------- + // A := P*A + // Top pivot, the plane (1,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + if(is_side_Left && is_pivot_Top && is_direct_Forward) + { + for(I j = tid; j < n; j += t_inc) + { + auto temp_hold = A[idx2D(0, j, lda)]; + for(I i = 1; i <= (m - 1); i++) + { + const auto ctemp = c[i - 1]; + const auto stemp = s[i - 1]; + const auto temp = A[idx2D(i, j, lda)]; + A[idx2D(i, j, lda)] = ctemp * temp - stemp * temp_hold; + temp_hold = stemp * temp + ctemp * temp_hold; + } + A[idx2D(0, j, lda)] = temp_hold; + } + return; + } + + // ----------------------------- + // A := P*A + // Top pivot, the plane (1,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + if(is_side_Left && is_pivot_Top && is_direct_Backward) + { + for(I j = tid; j < n; j += t_inc) + { + auto temp_hold = A[idx2D(0, j, lda)]; + for(I i = (m - 1); i >= 1; i--) + { + const auto ctemp = c[i - 1]; + const auto stemp = s[i - 1]; + const auto temp = A[idx2D(i, j, lda)]; + A[idx2D(i, j, lda)] = ctemp * temp - stemp * temp_hold; + temp_hold = stemp * temp + ctemp * temp_hold; + } + A[idx2D(0, j, lda)] = temp_hold; + } + return; + } + + // ----------------------------- + // A := P*A + // Bottom pivot, the plane (k,z) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + if(is_side_Left && is_pivot_Bottom && is_direct_Forward) + { + for(I j = tid; j < n; j += t_inc) + { + auto temp_hold = A[idx2D(m - 1, j, lda)]; + for(I i = 0; i <= (m - 2); i++) + { + const auto ctemp = c[i]; + const auto stemp = s[i]; + const auto temp = A[idx2D(i, j, lda)]; + A[idx2D(i, j, lda)] = stemp * temp_hold + ctemp * temp; + temp_hold = ctemp * temp_hold - stemp * temp; + } + A[idx2D(m - 1, j, lda)] = temp_hold; + } + return; + } + + // ----------------------------- + // A := P*A + // Bottom pivot, the plane (k,z) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + if(is_side_Left && is_pivot_Bottom && is_direct_Backward) + { + for(I j = tid; j < n; j += t_inc) + { + auto temp_hold = A[idx2D(m - 1, j, lda)]; + for(I i = (m - 2); i >= 0; i--) + { + const auto ctemp = c[i]; + const auto stemp = s[i]; + const auto temp = A[idx2D(i, j, lda)]; + A[idx2D(i, j, lda)] = stemp * temp_hold + ctemp * temp; + temp_hold = ctemp * temp_hold - stemp * temp; + } + A[idx2D(m - 1, j, lda)] = temp_hold; + } + return; + } + + // ----------------------------- + // A := A*P**T + // Variable pivot, the plane (k,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + if(is_side_Right && is_pivot_Variable && is_direct_Forward) + { + for(I i = tid; i < m; i += t_inc) + { + auto temp = A[i]; + for(I j = 0; j <= (n - 2); j++) + { + const auto ctemp = c[j]; + const auto stemp = s[j]; + const auto temp_hold = A[idx2D(i, j + 1, lda)]; + A[idx2D(i, j, lda)] = stemp * temp_hold + ctemp * temp; + temp = ctemp * temp_hold - stemp * temp; + } + A[idx2D(i, n - 1, lda)] = temp; + } + return; + } + + // ----------------------------- + // A := A*P**T + // Variable pivot, the plane (k,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + if(is_side_Right && is_pivot_Variable && is_direct_Backward) + { + for(I i = tid; i < m; i += t_inc) + { + auto temp = A[idx2D(i, n - 1, lda)]; + for(I j = (n - 2); j >= 0; j--) + { + const auto ctemp = c[j]; + const auto stemp = s[j]; + const auto temp_hold = A[idx2D(i, j, lda)]; + A[idx2D(i, j + 1, lda)] = ctemp * temp - stemp * temp_hold; + temp = stemp * temp + ctemp * temp_hold; + } + A[i] = temp; + } + return; + } + + // ----------------------------- + // A := A*P**T + // Top pivot, the plane (1,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + if(is_side_Right && is_pivot_Top && is_direct_Forward) + { + for(I i = tid; i < m; i += t_inc) + { + auto temp_hold = A[i]; + for(I j = 1; j <= (n - 1); j++) + { + const auto ctemp = c[j - 1]; + const auto stemp = s[j - 1]; + const auto temp = A[idx2D(i, j, lda)]; + A[idx2D(i, j, lda)] = ctemp * temp - stemp * temp_hold; + temp_hold = stemp * temp + ctemp * temp_hold; + } + A[i] = temp_hold; + } + return; + } + + // ----------------------------- + // A := A*P**T + // Top pivot, the plane (1,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + if(is_side_Right && is_pivot_Top && is_direct_Backward) + { + for(I i = tid; i < m; i += t_inc) + { + auto temp_hold = A[i]; + for(I j = (n - 1); j >= 1; j--) + { + const auto ctemp = c[j - 1]; + const auto stemp = s[j - 1]; + const auto temp = A[idx2D(i, j, lda)]; + A[idx2D(i, j, lda)] = ctemp * temp - stemp * temp_hold; + temp_hold = stemp * temp + ctemp * temp_hold; + } + A[i] = temp_hold; + } + return; + } + + // ----------------------------- + // A := A*P**T + // Bottom pivot, the plane (k,z) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + if(is_side_Right && is_pivot_Bottom && is_direct_Forward) + { + for(I i = tid; i < m; i += t_inc) + { + auto temp_hold = A[idx2D(i, n - 1, lda)]; + for(I j = 0; j <= (n - 2); j++) + { + const auto ctemp = c[j]; + const auto stemp = s[j]; + const auto temp = A[idx2D(i, j, lda)]; + A[idx2D(i, j, lda)] = stemp * temp_hold + ctemp * temp; + temp_hold = ctemp * temp_hold - stemp * temp; + } + A[idx2D(i, n - 1, lda)] = temp_hold; + } + return; + } + + // ----------------------------- + // A := A*P**T + // Bottom pivot, the plane (k,z) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + if(is_side_Right && is_pivot_Bottom && is_direct_Backward) + { + for(I i = tid; i < m; i += t_inc) + { + auto temp_hold = A[idx2D(i, n - 1, lda)]; + for(I j = (n - 2); j >= 0; j--) + { + const auto ctemp = c[j]; + const auto stemp = s[j]; + const auto temp = A[idx2D(i, j, lda)]; + A[idx2D(i, j, lda)] = stemp * temp_hold + ctemp * temp; + temp_hold = ctemp * temp_hold - stemp * temp; + } + A[idx2D(i, n - 1, lda)] = temp_hold; + } + return; + } + + return; +} + +template +__global__ static void __launch_bounds__(LASR_MAX_NTHREADS) + lasr_kernel(const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + I const m, + I const n, + S* CA, + const rocblas_stride strideC, + S* SA, + const rocblas_stride strideS, + U AA, + const rocblas_stride shiftA, + I const lda, + const rocblas_stride strideA, + const I batch_count) +{ + const auto nblocks = hipGridDim_x; + const auto nthreads_per_block = hipBlockDim_x; + const auto nthreads = nblocks * nthreads_per_block; + I const tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const t_inc = nthreads; + + // select batch instance and execute + auto const bid_start = hipBlockIdx_z; + auto const bid_inc = hipGridDim_z; + for(auto bid = bid_start; bid < batch_count; bid += bid_inc) + { + T* A_ = load_ptr_batch(AA, bid, shiftA, strideA); + S* C_ = CA + bid * strideC; + S* S_ = SA + bid * strideS; + + lasr_body(side, pivot, direct, m, n, C_, S_, A_, lda, tid, t_inc); + } +} + +/***************** GPU Device functions *****************************************/ +/********************************************************************************/ + +template +rocblas_status rocsolver_lasr_argCheck(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const rocblas_int m, + const rocblas_int n, + SS* C, + SS* S, + U A, + const rocblas_int lda) +{ + // order is important for unit tests: + + // 1. invalid/non-supported values + if(side != rocblas_side_left && side != rocblas_side_right) + return rocblas_status_invalid_value; + if(pivot != rocblas_pivot_variable && pivot != rocblas_pivot_top && pivot != rocblas_pivot_bottom) + return rocblas_status_invalid_value; + if(direct != rocblas_backward_direction && direct != rocblas_forward_direction) + return rocblas_status_invalid_value; + + // 2. invalid size + if(m < 0 || n < 0 || lda < m) + return rocblas_status_invalid_size; + + // skip pointer check if querying memory size + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_status_continue; + + // 3. invalid pointers + bool is_side_left = (side == rocblas_side_left); + bool is_side_right = (side == rocblas_side_right); + if(m && n && !A) + return rocblas_status_invalid_pointer; + if(is_side_left && m > 1 && (!C || !S)) + return rocblas_status_invalid_pointer; + if(is_side_right && n > 1 && (!C || !S)) + return rocblas_status_invalid_pointer; + + return rocblas_status_continue; +} + +template +rocblas_status rocsolver_lasr_template(rocblas_handle handle, + const rocblas_side side, + const rocblas_pivot pivot, + const rocblas_direct direct, + const I m, + const I n, + S* CA, + const rocblas_stride strideC, + S* SA, + const rocblas_stride strideS, + U AA, + const rocblas_stride shiftA, + const I lda, + const rocblas_stride strideA, + const I batch_count) +{ + ROCSOLVER_ENTER("lasr", "side:", side, "pivot:", pivot, "direct:", direct, "m:", m, "n:", n, + "shiftA:", shiftA, "lda:", lda, "bc:", batch_count); + + bool is_side_left = (side == rocblas_side_left); + bool is_side_right = (side == rocblas_side_right); + + // quick return + if(m == 0 || n == 0 || batch_count == 0) + return rocblas_status_success; + if((is_side_left && m < 2) || (is_side_right && n < 2)) + return rocblas_status_success; + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + auto const nthreads = LASR_MAX_NTHREADS; + auto const mn = (is_side_left) ? n : m; + auto const nblocks = (mn - 1) / nthreads + 1; + + hipLaunchKernelGGL((lasr_kernel), dim3(nblocks, 1, batch_count), dim3(nthreads, 1, 1), 0, + stream, side, pivot, direct, m, n, CA, strideC, SA, strideS, AA, shiftA, lda, + strideA, batch_count); + + return rocblas_status_success; +} + +ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_host_helpers.hpp b/library/src/include/lib_host_helpers.hpp index c0fd206be..bca4aae41 100644 --- a/library/src/include/lib_host_helpers.hpp +++ b/library/src/include/lib_host_helpers.hpp @@ -43,12 +43,13 @@ ROCSOLVER_BEGIN_NAMESPACE * =========================================================================== */ -inline int64_t idx2D(const int64_t i, const int64_t j, const int64_t lda) +__device__ __host__ inline int64_t idx2D(const int64_t i, const int64_t j, const int64_t lda) { return j * lda + i; } -inline int64_t idx2D(const int64_t i, const int64_t j, const int64_t inca, const int64_t lda) +__device__ __host__ inline int64_t + idx2D(const int64_t i, const int64_t j, const int64_t inca, const int64_t lda) { return j * lda + i * inca; } diff --git a/library/src/include/rocsolver_logvalue.hpp b/library/src/include/rocsolver_logvalue.hpp index 2de864c37..e908a3b39 100644 --- a/library/src/include/rocsolver_logvalue.hpp +++ b/library/src/include/rocsolver_logvalue.hpp @@ -129,6 +129,16 @@ struct formatter> : formatter } }; +template <> +struct formatter> : formatter +{ + template + auto format(rocsolver_logvalue wrapper, FormatCtx& ctx) ROCSOLVER_FMT_CONST + { + return formatter::format(rocsolver::rocblas2char_pivot(wrapper.value), ctx); + } +}; + template <> struct formatter> : formatter {