
Commit 95193b1

jeanlucf22 authored and nicolasbock committed
Couple with ELPA
1 parent cabd693 commit 95193b1

8 files changed (+312 -11 lines)

CMakeLists.txt (+19)
@@ -382,6 +382,25 @@ if(BML_MAGMA)
   endif()
 endif()
 
+set(BML_ELPA FALSE CACHE BOOL "Whether to use ELPA library")
+if(BML_ELPA)
+  message(STATUS "Search for ELPA in directory ${ELPA_DIR}\n")
+  find_package(ELPA REQUIRED)
+
+  if(${ELPA_FOUND})
+    message(STATUS "ELPA was found:\n"
+      "  ELPA_INCLUDE_DIRS: ${ELPA_INCLUDE_DIRS}\n"
+      "  ELPA_LIBRARY_DIRS: ${ELPA_LIBRARY_DIRS}\n"
+      "  ELPA_LIBRARIES: ${ELPA_LIBRARIES}"
+    )
+    add_definitions(-DBML_USE_ELPA)
+    include_directories(${ELPA_INCLUDE_DIRS})
+    link_directories(${CUDAToolkit_LIBRARY_DIR})
+    link_directories(${ELPA_LIBRARY_DIRS})
+    list(APPEND LINK_LIBRARIES ${ELPA_LIBRARIES})
+  endif()
+endif()
+
 set(BML_SCALAPACK FALSE CACHE BOOL "Whether to use ScaLAPACK library")
 if(BML_SCALAPACK)
   add_definitions(-DBML_USE_SCALAPACK)

build.sh (+6)
@@ -94,6 +94,8 @@ EOF
 echo "BML_XSMM              Build with XSMM (default is ${BML_XSMM})"
 echo "BML_SCALAPACK         Build with SCALAPACK (default is ${BML_SCALAPACK})"
 echo "SCALAPACK_LIBRARIES   ScaLapack libraries (default is '${SCALAPACK_LIBRARIES}')"
+echo "BML_ELPA              Build with ELPA (default is ${BML_ELPA})"
+echo "ELPA_DIR              ELPA directory (default is ${ELPA_DIR})"
 echo "BML_ELLBLOCK_MEMPOOL  Use ellblock memory pool (default is ${BML_ELLBLOCK_MEMPOOL}"
 echo "CUDA_TOOLKIT_ROOT_DIR Path to CUDA dir (default is '${CUDA_TOOLKIT_ROOT_DIR}')"
 echo "INTEL_OPT             {yes, no} (default is ${INTEL_OPT})"
@@ -125,6 +127,8 @@ set_defaults() {
     : ${BLAS_LIBRARIES:=}
     : ${LAPACK_LIBRARIES:=}
     : ${SCALAPACK_LIBRARIES:=}
+    : ${BML_ELPA:=no}
+    : ${ELPA_DIR:=}
     : ${BML_TESTING:=yes}
     : ${BML_VALGRIND:=no}
    : ${BML_COVERAGE:=no}
@@ -214,13 +218,15 @@ configure() {
         -DCMAKE_C_COMPILER="${CC}" \
         -DCMAKE_CXX_COMPILER="${CXX}" \
         -DCMAKE_Fortran_COMPILER="${FC}" \
+        -DCMAKE_PREFIX_PATH="${ELPA_DIR}" \
         ${CMAKE_C_FLAGS:+-DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}"} \
         ${CMAKE_CXX_FLAGS:+-DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}"} \
         ${CMAKE_Fortran_FLAGS:+-DCMAKE_Fortran_FLAGS="${CMAKE_Fortran_FLAGS}"} \
         -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
         -DBLAS_LIBRARIES="${BLAS_LIBRARIES}" \
         -DLAPACK_LIBRARIES="${LAPACK_LIBRARIES}" \
         -DSCALAPACK_LIBRARIES="${SCALAPACK_LIBRARIES}" \
+        -DBML_ELPA="${BML_ELPA}" \
         -DBML_OPENMP="${BML_OPENMP}" \
         -DMKL_GPU="${MKL_GPU}" \
         -DBML_MPI="${BML_MPI}" \

cmake/FindELPA.cmake (new file, +15)
@@ -0,0 +1,15 @@
+# - Find the ELPA library
+#
+# Usage:
+#   find_package(ELPA [REQUIRED] [QUIET] )
+#
+# It sets the following variables:
+#   ELPA_FOUND ... true if elpa is found on the system
+#   ELPA_LIBRARY_DIRS ... full path to elpa library
+#   ELPA_INCLUDE_DIRS ... elpa include directory
+#   ELPA_LIBRARIES ... elpa libraries
+
+
+find_package(PkgConfig REQUIRED)
+pkg_check_modules(ELPA REQUIRED elpa IMPORTED_TARGET)
+
New file (+51)
@@ -0,0 +1,51 @@
+#!/bin/bash
+module load cmake
+module load cuda
+module load gcc/10.2.0
+module load essl
+module load magma
+module load netlib-scalapack
+module load netlib-lapack
+
+rm -r build
+rm -r install
+
+MY_PATH=$(pwd)
+
+#get jsrun with full path
+JSRUN=$(which jsrun)
+echo ${JSRUN}
+
+export MAGMA_ROOT=${OLCF_MAGMA_ROOT:="${OLCF_MAGMA_ROOT}"}
+export CC=${CC:=mpicc}
+export FC=${FC:=mpif90}
+export CXX=${CXX:=mpiCC}
+export BML_OPENMP=${BML_OPENMP:=yes}
+export BML_MPI=${BML_MPI:=yes}
+export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=no}
+export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
+export BML_TESTING=${BML_TESTING:=yes}
+export BML_MAGMA=${BML_MAGMA:=yes}
+export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Debug}
+
+#set BLAS explicitly, otherwise cmake will pick the serial version of essl
+export BLAS_LIBRARIES=${BLAS_LIBRARIES:="$OLCF_ESSL_ROOT/lib64/libesslsmp.so"}
+#since essl does not contain all the lapack functions needed, we still need lapack
+export LAPACK_LIBRARIES=${LAPACK_LIBRARIES:="$OLCF_NETLIB_LAPACK_ROOT/lib64/liblapack.so"}
+export BML_SCALAPACK=${BML_SCALAPACK:=yes}
+export SCALAPACK_LIBRARIES=${SCALAPACK_LIBRARIES:="-L$OLCF_NETLIB_SCALAPACK_ROOT/lib -lscalapack"}
+
+export BML_CUDA=${BML_CUDA:=yes}
+export BML_ELPA=${BML_ELPA:=yes}
+export ELPA_DIR=${ELPA_DIR:=/ccs/proj/csc304/elpa}
+export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-lgfortran"}
+
+#use jsrun to run tests on a compute node
+export BML_NONMPI_PRECOMMAND=${BML_NONMPI_PRECOMMAND:=${JSRUN}}
+export BML_NONMPI_PRECOMMAND_ARGS=${BML_NONMPI_PRECOMMAND_ARGS:="-n1;-a1;-g1;-c7;--smpiargs=off"}
+
+export BML_MPIEXEC_EXECUTABLE=${BML_MPIEXEC_EXECUTABLE:=${JSRUN}}
+export BML_MPIEXEC_NUMPROCS_FLAG=${BML_MPIEXEC_NUMPROCS_FLAG:="-n"}
+export BML_MPIEXEC_PREFLAGS=${BML_MPIEXEC_PREFLAGS:="-a1;-c4;-bpacked:2;-g1"}
+
+./build.sh install

src/C-interface/distributed2d/bml_diagonalize_distributed2d.h (+12)
@@ -28,4 +28,16 @@ void bml_diagonalize_distributed2d_double_complex(
     void *eigenvalues,
     bml_matrix_distributed2d_t * eigenvectors);
 
+#ifdef BML_USE_ELPA
+void bml_diagonalize_distributed2d_elpa_single_real(
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors);
+
+void bml_diagonalize_distributed2d_elpa_double_real(
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors);
+#endif
+
 #endif
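
The two new declarations follow the existing per-precision naming scheme, so code built with BML_USE_ELPA can call them directly. A minimal sketch of a caller, assuming A and eigenvectors are compatible bml_matrix_distributed2d_t objects of global size N created elsewhere; the helper name is hypothetical:

#include <stdlib.h>
#include "bml_diagonalize_distributed2d.h"

#ifdef BML_USE_ELPA
/* Hypothetical driver: full eigendecomposition via the ELPA backend
 * (double real).  Returns a heap array of N eigenvalues; caller frees. */
static double *
diagonalize_with_elpa(bml_matrix_distributed2d_t * A,
                      bml_matrix_distributed2d_t * eigenvectors,
                      int N)
{
    double *eigenvalues = calloc(N, sizeof(double));
    bml_diagonalize_distributed2d_elpa_double_real(A, eigenvalues,
                                                   eigenvectors);
    return eigenvalues;
}
#endif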

src/C-interface/distributed2d/bml_diagonalize_distributed2d_typed.c (+199 -5)
@@ -14,6 +14,16 @@
 #include "../bml_transpose.h"
 #include "../bml_copy.h"
 
+#ifdef BML_USE_ELPA
+#include <elpa/elpa.h>
+#include <elpa/elpa_generic.h>
+#include "../dense/bml_allocate_dense.h"
+#ifdef BML_USE_MAGMA
+#include "../../typed.h"
+#include "magma_v2.h"
+#endif
+#endif
+
 #include <mpi.h>
 
 #include <complex.h>
@@ -139,13 +149,13 @@ void PZHEEVD(
  * \param eigenvalues Eigenvalues of A
  * \param eigenvectors Eigenvectors of A
  */
+#ifdef BML_USE_SCALAPACK
 void TYPED_FUNC(
-    bml_diagonalize_distributed2d) (
+    bml_diagonalize_distributed2d_scalapack) (
     bml_matrix_distributed2d_t * A,
     void *eigenvalues,
     bml_matrix_distributed2d_t * eigenvectors)
 {
-#ifdef BML_USE_SCALAPACK
     REAL_T *typed_eigenvalues = (REAL_T *) eigenvalues;
     // distributed2d format uses a row block distribution
     char order = 'R';
@@ -288,11 +298,195 @@ void TYPED_FUNC(
                         A->M / A->npcols, sequential);
         bml_deallocate(&zmat);
     }
-    // transpose eigenvectors to have them stored row-major
-    bml_transpose(eigenvectors->matrix);
+    return;
+}
+#endif
+
+#ifdef BML_USE_ELPA
+// Yu, V.; Moussa, J.; Kus, P.; Marek, A.; Messmer, P.; Yoon, M.; Lederer, H.; Blum, V.
+// "GPU-Acceleration of the ELPA2 Distributed Eigensolver for Dense Symmetric and Hermitian Eigenproblems",
+// Computer Physics Communications, 262, 2021
+void TYPED_FUNC(
+    bml_diagonalize_distributed2d_elpa) (
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors)
+{
+    char order = 'R';
+    int np_rows = A->nprows;
+    int np_cols = A->npcols;
+    int my_prow = A->myprow;
+    int my_pcol = A->mypcol;
+    int my_blacs_ctxt = Csys2blacs_handle(A->comm);
+    Cblacs_gridinit(&my_blacs_ctxt, &order, np_rows, np_cols);
+    Cblacs_gridinfo(my_blacs_ctxt, &np_rows, &np_cols, &my_prow, &my_pcol);
+
+    int na = A->N;
+    int na_rows = na / np_rows;
+    int na_cols = na / np_cols;
+    if (na_rows * np_rows != na)
+    {
+        LOG_ERROR("Number of MPI tasks/row should divide matrix size\n");
+    }
+    //printf("Matrix size: %d\n", na);
+    //printf("Number of MPI process rows: %d\n", np_rows);
+    //printf("Number of MPI process cols: %d\n", np_cols);
+
+    if (elpa_init(ELPA_API_VERSION) != ELPA_OK)
+    {
+        LOG_ERROR("Error: ELPA API version not supported");
+    }
+
+    int error_elpa;
+    elpa_t handle = elpa_allocate(&error_elpa);
+    /* Set parameters */
+    elpa_set(handle, "na", (int) na, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "nev", (int) na, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "local_nrows", (int) na_rows, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "local_ncols", (int) na_cols, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    // use one block/MPI task, so sets block size to no. local rows
+    elpa_set(handle, "nblk", (int) na_rows, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(A->comm)),
+             &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "process_row", (int) my_prow, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "process_col", (int) my_pcol, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    int success = elpa_setup(handle);
+    assert(success == ELPA_OK);
+
+    elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "gpu", 1, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    bml_matrix_t *Alocal = A->matrix;
+
+    bml_matrix_t *zmat = NULL;
+    bml_matrix_t *amat = NULL;
+    if (bml_get_type(Alocal) == dense)
+    {
+        amat = bml_copy_new(Alocal);
+        zmat = eigenvectors->matrix;
+    }
+    else
+    {
+        LOG_INFO("WARNING: convert local matrices to dense...\n");
+        // convert local matrix to dense
+        amat = bml_convert(Alocal, dense, A->matrix_precision,
+                           -1, sequential);
+        zmat = bml_convert(eigenvectors->matrix, dense, A->matrix_precision,
+                           -1, sequential);
+    }
+
+    // transpose to satisfy column major ELPA convention
+    // (global matrix assumed symmetric, so no need for communications)
+    if (A->myprow != A->mypcol)
+        bml_transpose(amat);
+
+    REAL_T *z = bml_get_data_ptr(zmat);
+    assert(z != NULL);
+    REAL_T *a = bml_get_data_ptr(amat);
+    assert(a != NULL);
+
+    /* Solve EV problem */
+    // interface: see elpa_generic.h
+    // handle  handle of the ELPA object, which defines the problem
+    // a       device pointer to matrix a in GPU memory
+    // ev      on return: pointer to eigenvalues in GPU memory
+    // q       on return: pointer to eigenvectors in GPU memory
+    // error   on return the error code, which can be queried with elpa_strerr()
+    LOG_DEBUG("Call ELPA eigensolver");
+#if defined(SINGLE_REAL) || defined(SINGLE_COMPLEX)
+    float *ev;
+    magma_int_t ret = magma_smalloc(&ev, na);
+#else
+    double *ev;
+    magma_int_t ret = magma_dmalloc(&ev, na);
+#endif
+    assert(ret == MAGMA_SUCCESS);
+#if defined(SINGLE_REAL)
+    elpa_eigenvectors_float(handle, a, ev, z, &error_elpa);
+#endif
+#if defined(DOUBLE_REAL)
+    elpa_eigenvectors_double(handle, a, ev, z, &error_elpa);
+#endif
+#if defined(SINGLE_COMPLEX)
+    elpa_eigenvectors_float_complex(handle, a, ev, z, &error_elpa);
+#endif
+#if defined(DOUBLE_COMPLEX)
+    elpa_eigenvectors_double_complex(handle, a, ev, z, &error_elpa);
+#endif
+
+    assert(error_elpa == ELPA_OK);
+    // copy eigenvalues to CPU
+    LOG_DEBUG("copy eigenvalues to CPU");
+#if defined(SINGLE_REAL) || defined(SINGLE_COMPLEX)
+    float *tmp = malloc(na * sizeof(float));
+    magma_sgetvector(na, ev, 1, tmp, 1, bml_queue());
+#endif
+#if defined(DOUBLE_REAL) || defined(DOUBLE_COMPLEX)
+    double *tmp = malloc(na * sizeof(double));
+    magma_dgetvector(na, ev, 1, tmp, 1, bml_queue());
+#endif
+    magma_queue_sync(bml_queue());
+
+    REAL_T *ev_ptr = eigenvalues;
+    for (int i = 0; i < A->N; i++)
+        ev_ptr[i] = (REAL_T) tmp[i];
+    free(tmp);
+
+    magma_free(ev);
+
+    bml_deallocate(&amat);
+    if (bml_get_type(Alocal) != dense)
+    {
+        bml_deallocate(&(eigenvectors->matrix));
+        eigenvectors->matrix =
+            bml_convert(zmat, bml_get_type(Alocal), A->matrix_precision,
+                        A->M / A->npcols, sequential);
+        bml_deallocate(&zmat);
+    }
+
+    elpa_deallocate(handle, &error_elpa);
+}
+#endif
+
+void TYPED_FUNC(
+    bml_diagonalize_distributed2d) (
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors)
+{
+#ifdef BML_USE_ELPA
+    TYPED_FUNC(bml_diagonalize_distributed2d_elpa) (A, eigenvalues,
+                                                    eigenvectors);
+#else
+#ifdef BML_USE_SCALAPACK
+    TYPED_FUNC(bml_diagonalize_distributed2d_scalapack) (A, eigenvalues,
+                                                         eigenvectors);
 #else
     LOG_ERROR
         ("Build with ScaLAPACK required for distributed2d diagonalization\n");
 #endif
-    return;
+#endif
+    // transpose eigenvectors to have them stored row-major
+    bml_transpose(eigenvectors->matrix);
 }
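
Stripped of the bml format conversions and the MAGMA device buffers, the ELPA handle lifecycle the new backend follows is compact. A condensed host-memory sketch for the double-real case, assuming a CPU-only ELPA build and caller-owned local blocks (a: input matrix, ev: eigenvalues, z: eigenvectors); the function name is illustrative:

#include <assert.h>
#include <mpi.h>
#include <elpa/elpa.h>

/* Condensed version of the setup/solve sequence above: one block per
 * MPI task, 2-stage solver, all eigenpairs requested. */
static void
elpa_solve_sketch(int na, int na_rows, int na_cols,
                  int my_prow, int my_pcol, MPI_Comm comm,
                  double *a, double *ev, double *z)
{
    int err;
    if (elpa_init(ELPA_API_VERSION) != ELPA_OK)
        MPI_Abort(comm, 1);

    elpa_t handle = elpa_allocate(&err);
    elpa_set(handle, "na", na, &err);           /* global matrix size */
    elpa_set(handle, "nev", na, &err);          /* number of eigenpairs */
    elpa_set(handle, "local_nrows", na_rows, &err);
    elpa_set(handle, "local_ncols", na_cols, &err);
    elpa_set(handle, "nblk", na_rows, &err);    /* one block per task */
    elpa_set(handle, "mpi_comm_parent", (int) MPI_Comm_c2f(comm), &err);
    elpa_set(handle, "process_row", my_prow, &err);
    elpa_set(handle, "process_col", my_pcol, &err);

    int ok = elpa_setup(handle);
    assert(ok == ELPA_OK);
    elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &err);

    /* a is overwritten; eigenvalues land in ev, eigenvectors in z */
    elpa_eigenvectors_double(handle, a, ev, z, &err);
    assert(err == ELPA_OK);

    elpa_deallocate(handle, &err);
}

The "gpu" knob and the MAGMA allocations in the committed code are what turn a, ev, and z into device pointers; the sketch keeps everything on the host.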

tests/C-tests/bml_test.c (+5 -2)
@@ -186,8 +186,11 @@ main(
 #ifdef BML_USE_MPI
     MPI_Init(&argc, &argv);
     bml_init(MPI_COMM_WORLD);
-    printf("with MPI\n");
-    int N = 14;
+    int nranks;
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+    LOG_INFO("Testing with MPI\n");
+    // N=64 seems to be the minimum to have ELPA work
+    int N = nranks > 1 ? 64 : 13;
 #else
     bml_init();
     int N = 13;
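
The hard-coded 64 is an empirical observation ("seems to be the minimum"), not a documented ELPA limit. If the test size ever needs to scale with the rank count, one way to pick N that also satisfies the divisibility check in the ELPA path (global size divisible by the process-grid dimension, assumed square for distributed2d) is sketched below; the helper is hypothetical:

#include <assert.h>
#include <math.h>

/* Hypothetical helper: smallest N >= 64 divisible by the grid
 * dimension p, with p*p == nranks as distributed2d expects. */
static int
pick_test_size(int nranks)
{
    if (nranks == 1)
        return 13;              /* serial path: no ELPA constraint */
    int p = (int) lround(sqrt((double) nranks));
    assert(p * p == nranks);
    int n = 64;
    while (n % p != 0)
        n++;
    return n;
}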
