|
14 | 14 | #include "../bml_transpose.h"
|
15 | 15 | #include "../bml_copy.h"
|
16 | 16 |
|
| 17 | +#ifdef BML_USE_ELPA |
| 18 | +#include <elpa/elpa.h> |
| 19 | +#include <elpa/elpa_generic.h> |
| 20 | +#include "../dense/bml_allocate_dense.h" |
| 21 | +#ifdef BML_USE_MAGMA |
| 22 | +#include "../../typed.h" |
| 23 | +#include "magma_v2.h" |
| 24 | +#endif |
| 25 | +#endif |
| 26 | + |
17 | 27 | #include <mpi.h>
|
18 | 28 |
|
19 | 29 | #include <complex.h>
|
@@ -139,13 +149,13 @@ void PZHEEVD(
|
139 | 149 | * \param eigenvalues Eigenvalues of A
|
140 | 150 | * \param eigenvectors Eigenvectors of A
|
141 | 151 | */
|
| 152 | +#ifdef BML_USE_SCALAPACK |
142 | 153 | void TYPED_FUNC(
|
143 |
| - bml_diagonalize_distributed2d) ( |
| 154 | + bml_diagonalize_distributed2d_scalapack) ( |
144 | 155 | bml_matrix_distributed2d_t * A,
|
145 | 156 | void *eigenvalues,
|
146 | 157 | bml_matrix_distributed2d_t * eigenvectors)
|
147 | 158 | {
|
148 |
| -#ifdef BML_USE_SCALAPACK |
149 | 159 | REAL_T *typed_eigenvalues = (REAL_T *) eigenvalues;
|
150 | 160 | // distributed2d format uses a row block distribution
|
151 | 161 | char order = 'R';
|
@@ -288,11 +298,195 @@ void TYPED_FUNC(
|
288 | 298 | A->M / A->npcols, sequential);
|
289 | 299 | bml_deallocate(&zmat);
|
290 | 300 | }
|
291 |
| - // transpose eigenvectors to have them stored row-major |
292 |
| - bml_transpose(eigenvectors->matrix); |
| 301 | + return; |
| 302 | +} |
| 303 | +#endif |
| 304 | + |
#ifdef BML_USE_ELPA
// Yu, V.; Moussa, J.; Kus, P.; Marek, A.; Messmer, P.; Yoon, M.; Lederer, H.; Blum, V.
// "GPU-Acceleration of the ELPA2 Distributed Eigensolver for Dense Symmetric and Hermitian Eigenproblems",
// Computer Physics Communications, 262, 2021

/** Diagonalize a distributed2d matrix using the ELPA eigensolver.
 *
 * Local blocks are converted to dense if needed, handed to ELPA
 * (column-major convention; eigenvalues are returned in GPU memory and
 * copied back to the host via MAGMA), and the eigenvectors are converted
 * back to the original local matrix format.
 *
 * \param A The matrix A (global matrix assumed symmetric/Hermitian)
 * \param eigenvalues On return, the eigenvalues of A (host array of REAL_T)
 * \param eigenvectors On return, the eigenvectors of A
 */
void TYPED_FUNC(
    bml_diagonalize_distributed2d_elpa) (
    bml_matrix_distributed2d_t * A,
    void *eigenvalues,
    bml_matrix_distributed2d_t * eigenvectors)
{
    // distributed2d format uses a row block distribution
    char order = 'R';
    int np_rows = A->nprows;
    int np_cols = A->npcols;
    int my_prow = A->myprow;
    int my_pcol = A->mypcol;
    int my_blacs_ctxt = Csys2blacs_handle(A->comm);
    Cblacs_gridinit(&my_blacs_ctxt, &order, np_rows, np_cols);
    Cblacs_gridinfo(my_blacs_ctxt, &np_rows, &np_cols, &my_prow, &my_pcol);

    int na = A->N;
    int na_rows = na / np_rows;
    int na_cols = na / np_cols;
    // one block per MPI task: the process grid must divide the matrix
    // size evenly in both dimensions (fix: only rows were checked before,
    // while na_cols is used below for "local_ncols")
    if (na_rows * np_rows != na)
    {
        LOG_ERROR("Number of MPI tasks/row should divide matrix size\n");
    }
    if (na_cols * np_cols != na)
    {
        LOG_ERROR("Number of MPI tasks/col should divide matrix size\n");
    }

    if (elpa_init(ELPA_API_VERSION) != ELPA_OK)
    {
        LOG_ERROR("Error: ELPA API version not supported");
    }

    int error_elpa;
    elpa_t handle = elpa_allocate(&error_elpa);

    /* Set parameters describing the matrix and its distribution */
    elpa_set(handle, "na", (int) na, &error_elpa);
    assert(error_elpa == ELPA_OK);

    // compute all eigenvectors
    elpa_set(handle, "nev", (int) na, &error_elpa);
    assert(error_elpa == ELPA_OK);

    elpa_set(handle, "local_nrows", (int) na_rows, &error_elpa);
    assert(error_elpa == ELPA_OK);

    elpa_set(handle, "local_ncols", (int) na_cols, &error_elpa);
    assert(error_elpa == ELPA_OK);

    // use one block/MPI task, so sets block size to no. local rows
    elpa_set(handle, "nblk", (int) na_rows, &error_elpa);
    assert(error_elpa == ELPA_OK);

    elpa_set(handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(A->comm)),
             &error_elpa);
    assert(error_elpa == ELPA_OK);

    elpa_set(handle, "process_row", (int) my_prow, &error_elpa);
    assert(error_elpa == ELPA_OK);

    elpa_set(handle, "process_col", (int) my_pcol, &error_elpa);
    assert(error_elpa == ELPA_OK);

    // NOTE(review): barrier is on MPI_COMM_WORLD, not A->comm -- confirm
    // all world ranks reach this point when A lives on a sub-communicator
    MPI_Barrier(MPI_COMM_WORLD);

    int success = elpa_setup(handle);
    assert(success == ELPA_OK);

    elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error_elpa);
    assert(error_elpa == ELPA_OK);

    elpa_set(handle, "gpu", 1, &error_elpa);
    assert(error_elpa == ELPA_OK);

    bml_matrix_t *Alocal = A->matrix;

    bml_matrix_t *zmat = NULL;
    bml_matrix_t *amat = NULL;
    if (bml_get_type(Alocal) == dense)
    {
        amat = bml_copy_new(Alocal);
        zmat = eigenvectors->matrix;
    }
    else
    {
        LOG_INFO("WARNING: convert local matrices to dense...\n");
        // convert local matrix to dense
        amat = bml_convert(Alocal, dense, A->matrix_precision,
                           -1, sequential);
        zmat = bml_convert(eigenvectors->matrix, dense, A->matrix_precision,
                           -1, sequential);
    }

    // transpose to satisfy column major ELPA convention
    // (global matrix assumed symmetric, so no need for communications;
    // diagonal blocks of a symmetric matrix are themselves symmetric)
    if (A->myprow != A->mypcol)
        bml_transpose(amat);

    REAL_T *z = bml_get_data_ptr(zmat);
    assert(z != NULL);
    REAL_T *a = bml_get_data_ptr(amat);
    assert(a != NULL);

    /* Solve EV problem */
    // interface: see elpa_generic.h
    // handle  handle of the ELPA object, which defines the problem
    // a       device pointer to matrix a in GPU memory
    // ev      on return: pointer to eigenvalues in GPU memory
    // q       on return: pointer to eigenvectors in GPU memory
    // error   on return the error code, which can be queried with elpa_strerr()
    LOG_DEBUG("Call ELPA eigensolver");
#if defined(SINGLE_REAL) || defined(SINGLE_COMPLEX)
    float *ev;
    magma_int_t ret = magma_smalloc(&ev, na);
#else
    double *ev;
    magma_int_t ret = magma_dmalloc(&ev, na);
#endif
    assert(ret == MAGMA_SUCCESS);
#if defined(SINGLE_REAL)
    elpa_eigenvectors_float(handle, a, ev, z, &error_elpa);
#endif
#if defined(DOUBLE_REAL)
    elpa_eigenvectors_double(handle, a, ev, z, &error_elpa);
#endif
#if defined(SINGLE_COMPLEX)
    elpa_eigenvectors_float_complex(handle, a, ev, z, &error_elpa);
#endif
#if defined(DOUBLE_COMPLEX)
    elpa_eigenvectors_double_complex(handle, a, ev, z, &error_elpa);
#endif

    assert(error_elpa == ELPA_OK);

    // copy eigenvalues to CPU
    LOG_DEBUG("copy eigenvalues to CPU");
#if defined(SINGLE_REAL) || defined(SINGLE_COMPLEX)
    float *tmp = malloc(na * sizeof(float));
    assert(tmp != NULL);        // fix: allocation was previously unchecked
    magma_sgetvector(na, ev, 1, tmp, 1, bml_queue());
#endif
#if defined(DOUBLE_REAL) || defined(DOUBLE_COMPLEX)
    double *tmp = malloc(na * sizeof(double));
    assert(tmp != NULL);        // fix: allocation was previously unchecked
    magma_dgetvector(na, ev, 1, tmp, 1, bml_queue());
#endif
    magma_queue_sync(bml_queue());

    // hand the (real) eigenvalues back to the caller's REAL_T array
    REAL_T *ev_ptr = eigenvalues;
    for (int i = 0; i < A->N; i++)
        ev_ptr[i] = (REAL_T) tmp[i];
    free(tmp);

    magma_free(ev);

    bml_deallocate(&amat);
    // if the local matrices were converted to dense above, convert the
    // eigenvectors back to the original local format
    if (bml_get_type(Alocal) != dense)
    {
        bml_deallocate(&(eigenvectors->matrix));
        eigenvectors->matrix =
            bml_convert(zmat, bml_get_type(Alocal), A->matrix_precision,
                        A->M / A->npcols, sequential);
        bml_deallocate(&zmat);
    }

    elpa_deallocate(handle, &error_elpa);
    // fix: release the BLACS grid created for this call; the original
    // leaked one grid context per invocation
    Cblacs_gridexit(my_blacs_ctxt);
}
#endif
| 471 | + |
| 472 | +void TYPED_FUNC( |
| 473 | + bml_diagonalize_distributed2d) ( |
| 474 | + bml_matrix_distributed2d_t * A, |
| 475 | + void *eigenvalues, |
| 476 | + bml_matrix_distributed2d_t * eigenvectors) |
| 477 | +{ |
| 478 | +#ifdef BML_USE_ELPA |
| 479 | + TYPED_FUNC(bml_diagonalize_distributed2d_elpa) (A, eigenvalues, |
| 480 | + eigenvectors); |
| 481 | +#else |
| 482 | +#ifdef BML_USE_SCALAPACK |
| 483 | + TYPED_FUNC(bml_diagonalize_distributed2d_scalapack) (A, eigenvalues, |
| 484 | + eigenvectors); |
293 | 485 | #else
|
294 | 486 | LOG_ERROR
|
295 | 487 | ("Build with ScaLAPACK required for distributed2d diagonalization\n");
|
296 | 488 | #endif
|
297 |
| - return; |
| 489 | +#endif |
| 490 | + // transpose eigenvectors to have them stored row-major |
| 491 | + bml_transpose(eigenvectors->matrix); |
298 | 492 | }
|
0 commit comments