diff --git a/.gitignore b/.gitignore
index ed7c7e4..014593a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 build/
+*.mod
+*.o
 *.out
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 682482f..9ac1453 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,41 +1,18 @@
 cmake_minimum_required(VERSION 3.15)
 
-# set(CMAKE_Fortran_COMPILER nvfortran )
-set(CMAKE_Fortran_COMPILER gfortran )
-project(komodo Fortran)
-
-option(DEBUG "Debug Mode                                 "  off)
-
-if(CMAKE_Fortran_COMPILER_ID  STREQUAL "GNU")
-   set(f90flags "-cpp -fbacktrace -fPIC -ffree-line-length-none")
-
-   if(DEBUG)
-      set(CMAKE_Fortran_FLAGS "-g  -Wall -pedantic -fbounds-check -ffpe-trap=invalid,overflow,underflow ${f90flags}")
-   else()
-      set(CMAKE_Fortran_FLAGS "-O3 ${f90flags}")
-   endif()
-
-   add_definitions(-D__GNU)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
-   set(f90flags "-fpp -std08 -assume byterecl -traceback -diag-disable 6009 -fPIC -list-line-length-none")
-
-   if(DEBUG)
-      set(CMAKE_Fortran_FLAGS "-g -warn -ftrapuv -fp-stack-check -check all -check:noarg_temp_created -fpe0 ${f90flags}")
-   else()
-      set(CMAKE_Fortran_FLAGS "-O3 ${f90flags}")
-   endif()
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" OR CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC")
-   set(f90flags "-Mfree -Mpreprocess -Mbackslash")
-
-   if(DEBUG)
-      set(CMAKE_Fortran_FLAGS "-traceback -Wall -Kieee -Ktrap=fp ${f90flags}")
-   else()
-      set(CMAKE_Fortran_FLAGS "-O3 -fast ${f90flags}")
-   endif()
-
-   add_definitions(-D__NVHPC)
+option(GPU "Use OpenACC GPU          "  OFF)
+
+# Set for debug only
+if(GPU)
+   set(CMAKE_Fortran_FLAGS "-fast -ta=tesla:lineinfo -Minfo=accel,ccff -Mfree -Mpreprocess -Mbackslash")
+   set(CMAKE_Fortran_COMPILER nvfortran )
+else()
+   set(CMAKE_Fortran_FLAGS "-cpp -O3")
+   set(CMAKE_Fortran_COMPILER gfortran )
 endif()
 
+project(komodo Fortran)
+
 file(GLOB SRC_sc ${PROJECT_SOURCE_DIR}/src/*.f90 )
 add_executable(komodo ${SRC_sc})
 
diff --git a/smpl/static/FDM3D b/smpl/static/FDM3D
new file mode 100644
index 0000000..09424fe
--- /dev/null
+++ b/smpl/static/FDM3D
@@ -0,0 +1,56 @@
+! IAEA2D input data with FDM nodal kernel
+
+! Mode card
+%MODE
+FORWARD
+
+! Case card
+%CASE
+IAEA2D
+IAEA2D Benchmark With  16x16 nodes/FA with FDM nodal kernel
+
+%KERN
+FDM
+
+%ITER
+1200 20 1.e-5 1.e-5 20 40 20 80 ! Set fission interpolation interval every 10 outer iteration
+
+! Cross-sections card
+%XSEC
+2  4    ! Number of groups and number of materials
+! sigtr     siga     nu*sigf    sigf       chi    sigs_g1   sigs_g2
+0.222222  0.010120  0.000000  0.000000  1.000000  0.000000  0.020000
+0.833333  0.080032  0.135000  0.135000  0.000000  0.000000  0.000000  ! COMPOSITION 1
+0.222222  0.010120  0.000000  0.000000  1.000000  0.000000  0.020000
+0.833333  0.085032  0.135000  0.135000  0.000000  0.000000  0.000000  ! COMPOSITION 2
+0.222222  0.010120  0.000000  0.000000  1.000000  0.000000  0.020000
+0.833333  0.130032  0.135000  0.135000  0.000000  0.000000  0.000000  ! COMPOSITION 3
+0.166667  0.000160  0.000000  0.000000  1.000000  0.000000  0.040000
+1.111111  0.010024  0.000000  0.000000  0.000000  0.000000  0.000000  ! COMPOSITION 4
+! Geometry card
+%GEOM
+9 9 40                  !nx, ny, nz
+10.0   8*20.0          !x-direction assembly size in cm
+8  8*16                ! 16x16 per FA
+8*20.0 10.0            !y-direction assembly size in cm
+8*16  8                ! 16x16 per FA
+40*1.25                 !z-direction assembly  in cm
+40*1                    !z-direction nodal is not divided
+1                      !np number of planar type
+40*1                    !planar assignment (from bottom to top)
+! Planar
+ 3  2  2  2  3  2  2  1  4
+ 2  2  2  2  2  2  2  1  4
+ 2  2  2  2  2  2  1  1  4
+ 2  2  2  2  2  2  1  4  4
+ 3  2  2  2  3  1  1  4  0
+ 2  2  2  2  1  1  4  4  0
+ 2  2  1  1  1  4  4  0  0
+ 1  1  1  4  4  4  0  0  0
+ 4  4  4  4  0  0  0  0  0
+! Boundary conditions (east), (west), (north), (south), (bottom), (top)
+1 2 2 1 1 1
+
+! Print Card
+%PRNT
+1 1 0 ! print output
diff --git a/src/komodo.f90 b/src/komodo.f90
index c9c9ee2..45fe2ce 100644
--- a/src/komodo.f90
+++ b/src/komodo.f90
@@ -82,11 +82,4 @@ PROGRAM main
 WRITE(*,*)
 WRITE(*,*) "  KOMODO EXIT NORMALLY"
 
-#ifndef __NVHPC
-#ifndef __GNU
-! KOMODO stop to prevent remaining memory not allocated for g95 compiler
-stop
-#endif
-#endif
-
 END PROGRAM
diff --git a/src/mod_cmfd.f90 b/src/mod_cmfd.f90
index b6a7ecd..17cf593 100644
--- a/src/mod_cmfd.f90
+++ b/src/mod_cmfd.f90
@@ -1,6 +1,7 @@
 module CMFD
 
   use sdata, only: dp
+  use gpu
   implicit none
   save
 
@@ -213,6 +214,9 @@ subroutine set_ind()
     end do
     ind%row(nnod+1) = rec
 
+    !$acc enter data copyin(ind)
+    !$acc enter data copyin(ind%row, ind%col)
+
   end subroutine set_ind
 
   !****************************************************************************!
@@ -298,6 +302,19 @@ subroutine matrix_setup(opt)
       end do
     end do
 
+    if (opt > 0) then
+      !$acc enter data copyin(A)
+      do g = 1, ng
+        !$acc enter data copyin(A(g)%elmn)
+      end do
+    else
+      do g = 1, ng
+        !$acc update device(A(g)%elmn(:))
+      end do
+    end if
+
+    
+
   end subroutine matrix_setup
 
   !****************************************************************************!
@@ -467,7 +484,7 @@ subroutine outer(popt)
       call TSrc(g, Ke, bs)
 
       !!!Inner Iteration
-      call bicg(nin, g, bs, f0(:,g))
+      call bicg(nin, g, bs)
     end do
     CALL FSrc (fs0)               !Update fission source
     errn = fs0 - fs0c
@@ -558,7 +575,7 @@ subroutine outer_fs(popt)
       call TSrc(g, Ke, bs)
 
       !!!Inner Iteration
-      call bicg(nin, g, bs, f0(:,g))
+      call bicg(nin, g, bs)
     end do
     CALL FSrc (fs0)               !Update fission source
     errn = fs0 - fs0c
@@ -652,7 +669,7 @@ subroutine outer_ad(popt)
       call TSrcAd(g, Ke, bs)
 
       !!!Inner Iteration
-      call bicg(nin, g, bs, f0(:,g))
+      call bicg(nin, g, bs)
     end do
     CALL FSrcAd (fs0)               !Update fission source
     errn = fs0 - fs0c
@@ -752,7 +769,7 @@ subroutine outer_th()
       call TSrc(g, Ke, bs)
 
       !!!Inner Iteration
-      call bicg(nin, g, bs, f0(:,g))
+      call bicg(nin, g, bs)
     end do
     CALL FSrc (fs0)               !Update fission source
     errn = fs0 - fs0c
@@ -830,7 +847,7 @@ subroutine outer_tr(ht,maxi)
       call TSrcTr(g, bs)
 
       !!!Inner Iteration
-      call bicg(nin, g, bs, f0(:,g))
+      call bicg(nin, g, bs)
     end do
     CALL FSrc (fs0)               !Update fission source
     errn = fs0 - fs0c
@@ -1187,7 +1204,9 @@ end subroutine RelEg
 
   !****************************************************************************!
 
-  subroutine bicg(imax,g,b,x)
+  subroutine bicg(imax,g,b)
+
+    use sdata, only: r, rs, v, p, s, t, tmp, nnod, f0
 
     !Purpose: to solve linear of system equation with BiCGSTAB method
     ! (without preconditioner). Sparse matrix saved in a and indexed in rc.
@@ -1200,38 +1219,67 @@ subroutine bicg(imax,g,b,x)
 
     integer, intent(in)                  :: imax, g  ! Max. number of iteration and group number
     real(dp), dimension(:), intent(in)   :: b   ! source
-    real(dp), dimension(:), intent(inout)  :: x
 
-    real(dp), dimension(size(b, dim=1))  :: r, rs, v, p, s, t
     real(dp)                             :: rho      , rho_prev
     real(dp)                             :: alpha    , omega   , beta, theta
     integer                              :: i
+    logical                              :: first = .true.
 
-    r  = b - sp_matvec(g,x)
-    rs = r
-    rho   = 1.0_dp; alpha = 1.0_dp; omega = 1.0_dp
-    v  = 0.0_dp; p  = 0.0_dp
+    if (first) then
+      call gpu_allocate(r,  nnod)
+      call gpu_allocate(rs, nnod)
+      call gpu_allocate(v,  nnod)
+      call gpu_allocate(p,  nnod)
+      call gpu_allocate(t,  nnod)
+      call gpu_allocate(s,  nnod)
+      call gpu_allocate(tmp,  nnod)
+      !$acc enter data copyin(f0(:,:))
+      first = .false.
+    end if
+
+    !$acc enter data copyin(b)
+
+    ! write(*,*) r(1:3)
+    ! call gpu_initialize(r, 3.0_dp)
+    ! !$acc update self(r)
+    ! write(*,*) r(1:3)
+
+    call sp_matvec(g,f0(:,g),r)
+
+    call xpby(b, -1._dp, r, rs)
+    call xew(rs, r)
+    
+    rho   = 1.0_dp
+    alpha = 1.0_dp
+    omega = 1.0_dp
+    call gpu_initialize(v, 0.0_dp)
+    call gpu_initialize(p, 0.0_dp)
 
     do i = 1, imax
       rho_prev = rho
       rho      = dproduct(rs,r)
       beta     = (rho/rho_prev) * (alpha/omega)
-      p        = r + beta * (p - omega*v)
-      v        = sp_matvec(g,p)
+      call xpby(p, -omega, v, tmp)
+      call xpby(r, beta, tmp, p)
+      call sp_matvec(g,p,v)
       alpha    = rho/dproduct(rs,v)
-      s        = r - alpha*v
-      t        = sp_matvec(g,s)
+      call xpby(r, -alpha, v, s)
+      call sp_matvec(g,s,t)
       theta    = dproduct(t,t)
       omega    = dproduct(t,s)/theta
-      x        = x + alpha*p + omega*s
-      r        = s - omega*t
+      call axpby(alpha, p, omega, s, tmp)
+      call xpby(f0(:,g), 1.0_dp, tmp, r)
+      call xew(r, f0(:,g))
+      call xpby(s, -omega, t, r)
     end do
+    !$acc exit data delete(b)
+    !$acc update self(f0(:,g))
 
   end subroutine bicg
 
  !****************************************************************************!
 
- function sp_matvec(g,x) result(v)
+ subroutine sp_matvec(g,x,vx)
 
    USE sdata, ONLY: A, ind, nnod
 
@@ -1240,28 +1288,32 @@ function sp_matvec(g,x) result(v)
 
    integer, intent(in) :: g                    ! group number of FDM matrix
    real(dp), dimension(:), intent(in)   :: x   ! vector
-   real(dp), dimension(size(x))         :: v   ! resulting vector
+   real(dp), dimension(:)         :: vx   ! resulting vector
 
    integer   :: i, j
    integer   :: row_start, row_end
    real(dp)  :: tmpsum
 
-  v = 0._dp
+  !$acc kernels present(A(g)%elmn, ind, x, vx)
+  !$acc loop device_type(nvidia) gang worker(32)
   do i = 1, nnod
     tmpsum = 0.
     row_start = ind%row(i) + 1
     row_end   = ind%row(i+1)
+    ! vector(8) because at each row there are 7 non zero elements
+    !$acc loop device_type(nvidia) vector(8)  
     do j = row_start, row_end
       tmpsum = tmpsum + A(g)%elmn(j) * x(ind%col(j)) 
     end do
-    v(i) = tmpsum
+    vx(i) = tmpsum
   end do
- end function sp_matvec
+  !$acc end kernels
 
- !****************************************************************************!
+ end subroutine sp_matvec
 
- function dproduct(a,b) result(x)
+ !****************************************************************************!
 
+ real(dp) function dproduct(a,b)
    !purpose: to perform dot product
 
    real(dp), dimension(:), intent(in)   :: a, b   ! vector
@@ -1269,13 +1321,67 @@ function dproduct(a,b) result(x)
 
    integer   :: n
 
+   
+   !$acc kernels present(a, b)
    x = 0._dp
    do n = 1, size(a)
      x = x + a(n) * b(n)
    end do
+   !$acc end kernels
+
+   dproduct = x
 
  end function dproduct
 
   !****************************************************************************!
 
+ subroutine axpby(alpha, x, beta, y, w)
+  implicit none
+  real(dp)              :: alpha, beta, x(:), y(:)
+  real(dp)              :: w(:)
+  integer               :: i, length
+
+  length = size(x)
+  !$acc kernels present(x,y,w)
+  do i=1,length
+    w(i) = alpha*x(i) + beta*y(i)
+  enddo
+  !$acc end kernels
+
+end subroutine
+
+  !****************************************************************************!
+
+subroutine xpby(x, beta, y, w)
+  implicit none
+  real(dp)              :: beta, x(:), y(:)
+  real(dp)              :: w(:)
+  integer               :: i, length
+
+  length = size(x)
+  !$acc kernels present(x,y,w)
+  do i=1,length
+    w(i) = x(i) + beta*y(i)
+  enddo
+  !$acc end kernels
+
+end subroutine
+
+  !****************************************************************************!
+
+subroutine xew(x, w)
+  implicit none
+  real(dp)              :: x(:)
+  real(dp)              :: w(size(x))
+  integer               :: i, length
+
+  length = size(x)
+  !$acc kernels present(x,w)
+  do i=1,length
+    w(i) = x(i)
+  enddo
+  !$acc end kernels
+
+end subroutine
+
 end module
diff --git a/src/mod_data.f90 b/src/mod_data.f90
index 8b0aaed..a49ee59 100644
--- a/src/mod_data.f90
+++ b/src/mod_data.f90
@@ -200,6 +200,7 @@ MODULE sdata
 real(dp)                              :: ndmax               ! Maximum change in nodal coupling coefficients
 character(len=4)                      :: kern = 'SANM'
 integer                               :: im, jm, km
+real(dp), allocatable                 :: r(:), rs(:), v(:), p(:), s(:), t(:), tmp(:)
 
 !Timing
 real(dp) :: fdm_time = 0., nod_time = 0., xs_time = 0., &
diff --git a/src/mod_gpu.f90 b/src/mod_gpu.f90
new file mode 100644
index 0000000..4238047
--- /dev/null
+++ b/src/mod_gpu.f90
@@ -0,0 +1,64 @@
+module gpu
+
+    use sdata, only: dp
+    implicit none
+    save
+
+    interface gpu_allocate
+        procedure allocate_real_1d
+        procedure allocate_real_2d
+    end interface
+
+    interface gpu_initialize
+        procedure initialize_real_1d
+        procedure initialize_real_scalar
+    end interface
+
+    contains
+
+    subroutine allocate_real_1d(x, len1)
+
+        real(dp), allocatable :: x(:)
+        integer, intent(in)   :: len1
+  
+        allocate(x(len1))
+        !$acc enter data create(x)
+    
+    end subroutine
+  
+    subroutine allocate_real_2d(x, len1, len2)
+
+  
+        real(dp), allocatable :: x(:,:)
+        integer, intent(in)   :: len1, len2
+    
+        allocate(x(len1, len2))
+        !$acc enter data create(x)
+    
+    end subroutine
+
+    subroutine initialize_real_1d(x, value)
+
+        real(dp)              :: x(:)
+        real(dp), intent(in)  :: value
+  
+        !$acc kernels present(x)
+        x(:) = value
+        !$acc end kernels
+    
+    end subroutine
+
+    subroutine initialize_real_scalar(x, value)
+
+        real(dp)              :: x
+        real(dp), intent(in)  :: value
+  
+        !$acc kernels present(x)
+        x = value
+        !$acc end kernels
+    
+    end subroutine
+  
+  
+  end module
+  
\ No newline at end of file