diff --git a/.gitignore b/.gitignore index ed7c7e4..014593a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ build/ +*.mod +*.o *.out \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 682482f..9ac1453 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,41 +1,18 @@ cmake_minimum_required(VERSION 3.15) -# set(CMAKE_Fortran_COMPILER nvfortran ) -set(CMAKE_Fortran_COMPILER gfortran ) -project(komodo Fortran) - -option(DEBUG "Debug Mode " off) - -if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") - set(f90flags "-cpp -fbacktrace -fPIC -ffree-line-length-none") - - if(DEBUG) - set(CMAKE_Fortran_FLAGS "-g -Wall -pedantic -fbounds-check -ffpe-trap=invalid,overflow,underflow ${f90flags}") - else() - set(CMAKE_Fortran_FLAGS "-O3 ${f90flags}") - endif() - - add_definitions(-D__GNU) -elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") - set(f90flags "-fpp -std08 -assume byterecl -traceback -diag-disable 6009 -fPIC -list-line-length-none") - - if(DEBUG) - set(CMAKE_Fortran_FLAGS "-g -warn -ftrapuv -fp-stack-check -check all -check:noarg_temp_created -fpe0 ${f90flags}") - else() - set(CMAKE_Fortran_FLAGS "-O3 ${f90flags}") - endif() -elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" OR CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC") - set(f90flags "-Mfree -Mpreprocess -Mbackslash") - - if(DEBUG) - set(CMAKE_Fortran_FLAGS "-traceback -Wall -Kieee -Ktrap=fp ${f90flags}") - else() - set(CMAKE_Fortran_FLAGS "-O3 -fast ${f90flags}") - endif() - - add_definitions(-D__NVHPC) +option(GPU "Use OpenACC GPU " OFF) + +# Set for debug only +if(GPU) + set(CMAKE_Fortran_FLAGS "-fast -ta=tesla:lineinfo -Minfo=accel,ccff -Mfree -Mpreprocess -Mbackslash") + set(CMAKE_Fortran_COMPILER nvfortran ) +else() + set(CMAKE_Fortran_FLAGS "-cpp -O3") + set(CMAKE_Fortran_COMPILER gfortran ) endif() +project(komodo Fortran) + file(GLOB SRC_sc ${PROJECT_SOURCE_DIR}/src/*.f90 ) add_executable(komodo ${SRC_sc}) diff --git a/smpl/static/FDM3D b/smpl/static/FDM3D new file mode 100644 index 0000000..09424fe --- /dev/null +++ b/smpl/static/FDM3D @@ -0,0 +1,56 @@ +! IAEA2D input data with FDM nodal kernel + +! Mode card +%MODE +FORWARD + +! Case card +%CASE +IAEA2D +IAEA2D Benchmark With 16x16 nodes/FA with FDM nodal kernel + +%KERN +FDM + +%ITER +1200 20 1.e-5 1.e-5 20 40 20 80 ! Set fission interpolation interval every 10 outer iteration + +! Cross-sections card +%XSEC +2 4 ! Number of groups and number of materials +! sigtr siga nu*sigf sigf chi sigs_g1 sigs_g2 +0.222222 0.010120 0.000000 0.000000 1.000000 0.000000 0.020000 +0.833333 0.080032 0.135000 0.135000 0.000000 0.000000 0.000000 ! COMPOSITION 1 +0.222222 0.010120 0.000000 0.000000 1.000000 0.000000 0.020000 +0.833333 0.085032 0.135000 0.135000 0.000000 0.000000 0.000000 ! COMPOSITION 2 +0.222222 0.010120 0.000000 0.000000 1.000000 0.000000 0.020000 +0.833333 0.130032 0.135000 0.135000 0.000000 0.000000 0.000000 ! COMPOSITION 3 +0.166667 0.000160 0.000000 0.000000 1.000000 0.000000 0.040000 +1.111111 0.010024 0.000000 0.000000 0.000000 0.000000 0.000000 ! COMPOSITION 4 +! Geometry card +%GEOM +9 9 40 !nx, ny, nz +10.0 8*20.0 !x-direction assembly size in cm +8 8*16 ! 16x16 per FA +8*20.0 10.0 !y-direction assembly size in cm +8*16 8 ! 16x16 per FA +40*1.25 !z-direction assembly in cm +40*1 !z-direction nodal is not divided +1 !np number of planar type +40*1 !planar assignment (from bottom to top) +! Planar + 3 2 2 2 3 2 2 1 4 + 2 2 2 2 2 2 2 1 4 + 2 2 2 2 2 2 1 1 4 + 2 2 2 2 2 2 1 4 4 + 3 2 2 2 3 1 1 4 0 + 2 2 2 2 1 1 4 4 0 + 2 2 1 1 1 4 4 0 0 + 1 1 1 4 4 4 0 0 0 + 4 4 4 4 0 0 0 0 0 +! Boundary conditions (east), (west), (north), (south), (bottom), (top) +1 2 2 1 1 1 + +! Print Card +%PRNT +1 1 0 ! print output diff --git a/src/komodo.f90 b/src/komodo.f90 index c9c9ee2..45fe2ce 100644 --- a/src/komodo.f90 +++ b/src/komodo.f90 @@ -82,11 +82,4 @@ PROGRAM main WRITE(*,*) WRITE(*,*) " KOMODO EXIT NORMALLY" -#ifndef __NVHPC -#ifndef __GNU -! KOMODO stop to prevent remaining memory not allocated for g95 compiler -stop -#endif -#endif - END PROGRAM diff --git a/src/mod_cmfd.f90 b/src/mod_cmfd.f90 index b6a7ecd..17cf593 100644 --- a/src/mod_cmfd.f90 +++ b/src/mod_cmfd.f90 @@ -1,6 +1,7 @@ module CMFD use sdata, only: dp + use gpu implicit none save @@ -213,6 +214,9 @@ subroutine set_ind() end do ind%row(nnod+1) = rec + !$acc enter data copyin(ind) + !$acc enter data copyin(ind%row, ind%col) + end subroutine set_ind !****************************************************************************! @@ -298,6 +302,19 @@ subroutine matrix_setup(opt) end do end do + if (opt > 0) then + !$acc enter data copyin(A) + do g = 1, ng + !$acc enter data copyin(A(g)%elmn) + end do + else + do g = 1, ng + !$acc update device(A(g)%elmn(:)) + end do + end if + + + end subroutine matrix_setup !****************************************************************************! @@ -467,7 +484,7 @@ subroutine outer(popt) call TSrc(g, Ke, bs) !!!Inner Iteration - call bicg(nin, g, bs, f0(:,g)) + call bicg(nin, g, bs) end do CALL FSrc (fs0) !Update fission source errn = fs0 - fs0c @@ -558,7 +575,7 @@ subroutine outer_fs(popt) call TSrc(g, Ke, bs) !!!Inner Iteration - call bicg(nin, g, bs, f0(:,g)) + call bicg(nin, g, bs) end do CALL FSrc (fs0) !Update fission source errn = fs0 - fs0c @@ -652,7 +669,7 @@ subroutine outer_ad(popt) call TSrcAd(g, Ke, bs) !!!Inner Iteration - call bicg(nin, g, bs, f0(:,g)) + call bicg(nin, g, bs) end do CALL FSrcAd (fs0) !Update fission source errn = fs0 - fs0c @@ -752,7 +769,7 @@ subroutine outer_th() call TSrc(g, Ke, bs) !!!Inner Iteration - call bicg(nin, g, bs, f0(:,g)) + call bicg(nin, g, bs) end do CALL FSrc (fs0) !Update fission source errn = fs0 - fs0c @@ -830,7 +847,7 @@ subroutine outer_tr(ht,maxi) call TSrcTr(g, bs) !!!Inner Iteration - call bicg(nin, g, bs, f0(:,g)) + call bicg(nin, g, bs) end do CALL FSrc (fs0) !Update fission source errn = fs0 - fs0c @@ -1187,7 +1204,9 @@ end subroutine RelEg !****************************************************************************! - subroutine bicg(imax,g,b,x) + subroutine bicg(imax,g,b) + + use sdata, only: r, rs, v, p, s, t, tmp, nnod, f0 !Purpose: to solve linear of system equation with BiCGSTAB method ! (without preconditioner). Sparse matrix saved in a and indexed in rc. @@ -1200,38 +1219,67 @@ subroutine bicg(imax,g,b,x) integer, intent(in) :: imax, g ! Max. number of iteration and group number real(dp), dimension(:), intent(in) :: b ! source - real(dp), dimension(:), intent(inout) :: x - real(dp), dimension(size(b, dim=1)) :: r, rs, v, p, s, t real(dp) :: rho , rho_prev real(dp) :: alpha , omega , beta, theta integer :: i + logical :: first = .true. - r = b - sp_matvec(g,x) - rs = r - rho = 1.0_dp; alpha = 1.0_dp; omega = 1.0_dp - v = 0.0_dp; p = 0.0_dp + if (first) then + call gpu_allocate(r, nnod) + call gpu_allocate(rs, nnod) + call gpu_allocate(v, nnod) + call gpu_allocate(p, nnod) + call gpu_allocate(t, nnod) + call gpu_allocate(s, nnod) + call gpu_allocate(tmp, nnod) + !$acc enter data copyin(f0(:,:)) + first = .false. + end if + + !$acc enter data copyin(b) + + ! write(*,*) r(1:3) + ! call gpu_initialize(r, 3.0_dp) + ! !$acc update self(r) + ! write(*,*) r(1:3) + + call sp_matvec(g,f0(:,g),r) + + call xpby(b, -1._dp, r, rs) + call xew(rs, r) + + rho = 1.0_dp + alpha = 1.0_dp + omega = 1.0_dp + call gpu_initialize(v, 0.0_dp) + call gpu_initialize(p, 0.0_dp) do i = 1, imax rho_prev = rho rho = dproduct(rs,r) beta = (rho/rho_prev) * (alpha/omega) - p = r + beta * (p - omega*v) - v = sp_matvec(g,p) + call xpby(p, -omega, v, tmp) + call xpby(r, beta, tmp, p) + call sp_matvec(g,p,v) alpha = rho/dproduct(rs,v) - s = r - alpha*v - t = sp_matvec(g,s) + call xpby(r, -alpha, v, s) + call sp_matvec(g,s,t) theta = dproduct(t,t) omega = dproduct(t,s)/theta - x = x + alpha*p + omega*s - r = s - omega*t + call axpby(alpha, p, omega, s, tmp) + call xpby(f0(:,g), 1.0_dp, tmp, r) + call xew(r, f0(:,g)) + call xpby(s, -omega, t, r) end do + !$acc exit data delete(b) + !$acc update self(f0(:,g)) end subroutine bicg !****************************************************************************! - function sp_matvec(g,x) result(v) + subroutine sp_matvec(g,x,vx) USE sdata, ONLY: A, ind, nnod @@ -1240,28 +1288,32 @@ function sp_matvec(g,x) result(v) integer, intent(in) :: g ! group number of FDM matrix real(dp), dimension(:), intent(in) :: x ! vector - real(dp), dimension(size(x)) :: v ! resulting vector + real(dp), dimension(:) :: vx ! resulting vector integer :: i, j integer :: row_start, row_end real(dp) :: tmpsum - v = 0._dp + !$acc kernels present(A(g)%elmn, ind, x, vx) + !$acc loop device_type(nvidia) gang worker(32) do i = 1, nnod tmpsum = 0. row_start = ind%row(i) + 1 row_end = ind%row(i+1) + ! vector(8) because at each row there are 7 non zero elements + !$acc loop device_type(nvidia) vector(8) do j = row_start, row_end tmpsum = tmpsum + A(g)%elmn(j) * x(ind%col(j)) end do - v(i) = tmpsum + vx(i) = tmpsum end do - end function sp_matvec + !$acc end kernels - !****************************************************************************! + end subroutine sp_matvec - function dproduct(a,b) result(x) + !****************************************************************************! + real(dp) function dproduct(a,b) !purpose: to perform dot product real(dp), dimension(:), intent(in) :: a, b ! vector @@ -1269,13 +1321,67 @@ function dproduct(a,b) result(x) integer :: n + + !$acc kernels present(a, b) x = 0._dp do n = 1, size(a) x = x + a(n) * b(n) end do + !$acc end kernels + + dproduct = x end function dproduct !****************************************************************************! + subroutine axpby(alpha, x, beta, y, w) + implicit none + real(dp) :: alpha, beta, x(:), y(:) + real(dp) :: w(:) + integer :: i, length + + length = size(x) + !$acc kernels present(x,y,w) + do i=1,length + w(i) = alpha*x(i) + beta*y(i) + enddo + !$acc end kernels + +end subroutine + + !****************************************************************************! + +subroutine xpby(x, beta, y, w) + implicit none + real(dp) :: beta, x(:), y(:) + real(dp) :: w(:) + integer :: i, length + + length = size(x) + !$acc kernels present(x,y,w) + do i=1,length + w(i) = x(i) + beta*y(i) + enddo + !$acc end kernels + +end subroutine + + !****************************************************************************! + +subroutine xew(x, w) + implicit none + real(dp) :: x(:) + real(dp) :: w(size(x)) + integer :: i, length + + length = size(x) + !$acc kernels present(x,w) + do i=1,length + w(i) = x(i) + enddo + !$acc end kernels + +end subroutine + end module diff --git a/src/mod_data.f90 b/src/mod_data.f90 index 8b0aaed..a49ee59 100644 --- a/src/mod_data.f90 +++ b/src/mod_data.f90 @@ -200,6 +200,7 @@ MODULE sdata real(dp) :: ndmax ! Maximum change in nodal coupling coefficients character(len=4) :: kern = 'SANM' integer :: im, jm, km +real(dp), allocatable :: r(:), rs(:), v(:), p(:), s(:), t(:), tmp(:) !Timing real(dp) :: fdm_time = 0., nod_time = 0., xs_time = 0., & diff --git a/src/mod_gpu.f90 b/src/mod_gpu.f90 new file mode 100644 index 0000000..4238047 --- /dev/null +++ b/src/mod_gpu.f90 @@ -0,0 +1,64 @@ +module gpu + + use sdata, only: dp + implicit none + save + + interface gpu_allocate + procedure allocate_real_1d + procedure allocate_real_2d + end interface + + interface gpu_initialize + procedure initialize_real_1d + procedure initialize_real_scalar + end interface + + contains + + subroutine allocate_real_1d(x, len1) + + real(dp), allocatable :: x(:) + integer, intent(in) :: len1 + + allocate(x(len1)) + !$acc enter data create(x) + + end subroutine + + subroutine allocate_real_2d(x, len1, len2) + + + real(dp), allocatable :: x(:,:) + integer, intent(in) :: len1, len2 + + allocate(x(len1, len2)) + !$acc enter data create(x) + + end subroutine + + subroutine initialize_real_1d(x, value) + + real(dp) :: x(:) + real(dp), intent(in) :: value + + !$acc kernels present(x) + x(:) = value + !$acc end kernels + + end subroutine + + subroutine initialize_real_scalar(x, value) + + real(dp) :: x + real(dp), intent(in) :: value + + !$acc kernels present(x) + x = value + !$acc end kernels + + end subroutine + + + end module + \ No newline at end of file