-
Notifications
You must be signed in to change notification settings - Fork 0
/
lu_solve_omp_offload.F90
197 lines (172 loc) · 7.59 KB
/
lu_solve_omp_offload.F90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
!===============================================================================
! Copyright 2021-2022 Intel Corporation.
!
! This software and the related documents are Intel copyrighted materials, and
! your use of them is governed by the express license under which they were
! provided to you (License). Unless the License provides otherwise, you may not
! use, modify, copy, publish, distribute, disclose or transmit this software or
! the related documents without Intel's prior written permission.
!
! This software and the related documents are provided as is, with no express
! or implied warranties, other than those that are expressly stated in the
! License.
!===============================================================================
!
! Content:
! Intel(R) oneAPI Math Kernel Library (oneMKL)
! FORTRAN OpenMP offload examples for solving batched linear systems.
!
! Compile for CPU:
! ifx -i8 -qmkl -free \
! lu_solve_omp_offload_ex1_timer.F90 -o lu_solve_ex1_timer
!
! Compile for GPU:
! ifx -i8 -DMKL_ILP64 -qopenmp -fopenmp-targets=spir64 -fsycl -free \
! lu_solve_omp_offload_ex1_timer.F90 -o lu_solve_ex1_omp_timer \
! -L${MKLROOT}/lib/intel64 -lmkl_sycl -lmkl_intel_ilp64 \
! -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -ldl
!
! Compile with -DSP to use single precision instead of double precision.
!
!*******************************************************************************
!$ include "mkl_omp_offload.f90"
program solve_batched_linear_systems
! Decide whether to use 32- or 64-bit integer type
#if defined(MKL_ILP64)
!$ use onemkl_lapack_omp_offload_ilp64 ! 64-bit
#else
!$ use onemkl_lapack_omp_offload_lp64 ! 32-bit
#endif
implicit none
integer :: n = 64, batch_size = 4096, nrhs = 1, cycles = 5
integer :: lda, stride_a, stride_ipiv
integer :: ldb, stride_b
integer, allocatable :: ipiv(:,:), info(:)
#if defined(SP)
real (kind=4), allocatable :: a(:,:), b(:,:), a_orig(:,:), b_orig(:,:), x(:)
real (kind=4) :: residual, threshold = 1.0e-5
#else
real (kind=8), allocatable :: a(:,:), b(:,:), a_orig(:,:), b_orig(:,:), x(:)
real (kind=8) :: residual, threshold = 1.0d-9
#endif
integer (kind=8) :: start_time, end_time, clock_precision
real (kind=8) :: cycle_time, total_time = 0.0d0
integer :: i, j, c, allocstat, stat
character (len = 132) :: allocmsg
character (len = 32) :: arg1, arg2
! Simple command-line parser with no error checks
do i = 1, command_argument_count(), 2
call get_command_argument(i, arg1)
select case (arg1)
case ('-n')
call get_command_argument(i+1, arg2)
read(arg2, *, iostat=stat) n
case ('-b')
call get_command_argument(i+1, arg2)
read(arg2, *, iostat=stat) batch_size
case ('-r')
call get_command_argument(i+1, arg2)
read(arg2, *, iostat=stat) nrhs
case ('-c')
call get_command_argument(i+1, arg2)
read(arg2, *, iostat=stat) cycles
case default
print *, 'Unrecognized command-line option:', arg1
stop
end select
enddo
print *, 'Matrix dimensions:', n
print *, 'Batch size:', batch_size
print *, 'Number of RHS:', nrhs
print *, 'Number of test cycles:', cycles
lda = n
stride_a = n * lda
stride_ipiv = n
ldb = n
stride_b = n * nrhs
! Allocate memory for linear algebra computations
allocate (a(stride_a, batch_size), b(n, batch_size*nrhs), &
ipiv(stride_ipiv, batch_size), info(batch_size), &
stat = allocstat, errmsg = allocmsg)
if (allocstat > 0) stop trim(allocmsg)
! Allocate memory for error checking
allocate (a_orig(stride_a, batch_size), b_orig(n, batch_size*nrhs), x(n), &
stat = allocstat, errmsg = allocmsg)
if (allocstat > 0) stop trim(allocmsg)
call system_clock(count_rate = clock_precision)
call random_seed()
do c = 1, cycles
! Initialize the matrices with a random number in the interval (-0.5, 0.5)
call random_number(a)
a = 0.5 - a
! Make diagonal band values larger to ensure well-conditioned matrices
do i = 1, n
a(i+(i-1)*lda,:) = a(i+(i-1)*lda,:) + 50.0
if (i .ne. 1) a(i+(i-1)*lda-1,:) = a(i+(i-1)*lda-1,:) + 20.0
if (i .ne. n) a(i+(i-1)*lda+1,:) = a(i+(i-1)*lda+1,:) + 20.0
enddo
! Initialize the RHS with a random number in the interval (-2.5, 2.5)
call random_number(b)
b = 2.5 - (5.0 * b)
a_orig = a
b_orig = b
call system_clock(start_time) ! Start timer
! Compute the LU factorizations and solve the linear systems using OpenMP offload.
! On entry, "a" contains the input matrices. On exit, it contains the factored matrices.
!$omp target data map(tofrom:a) map(from:ipiv) map(from:info)
!$omp dispatch
#if defined(SP)
call sgetrf_batch_strided(n, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, info)
#else
call dgetrf_batch_strided(n, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, info)
#endif
!$omp end target data
if (any(info .ne. 0)) then
print *, 'Error: getrf_batch_strided returned with errors.'
stop
else
! Solving the linear systems. On exit, the solutions are stored in b.
!$omp target data map(to:a) map(to:ipiv) map(tofrom: b) map(from:info)
!$omp dispatch
#if defined(SP)
call sgetrs_batch_strided('N', n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, &
b, ldb, stride_b, batch_size, info)
#else
call dgetrs_batch_strided('N', n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, &
b, ldb, stride_b, batch_size, info)
#endif
!$omp end target data
call system_clock(end_time) ! Stop timer
if (any(info .ne. 0)) then
print *, 'Error: getrs_batch_strided returned with errors.'
stop
else
! Compute a_orig*b and compare result to saved RHS
do i = 1, batch_size
do j = 1, nrhs
x = 0.0
#if defined(SP)
call sgemv('N', n, n, 1.0, a_orig(:,i), lda, b(:,(i-1)*nrhs+j), 1, 0.0, x, 1)
#else
call dgemv('N', n, n, 1.0d0, a_orig(:,i), lda, b(:,(i-1)*nrhs+j), 1, 0.0d0, x, 1)
#endif
! Check relative residual
residual = norm2(b_orig(:,(i-1)*nrhs+j) - x(:)) / norm2(b_orig(:,(i-1)*nrhs+j))
if (residual > threshold) then
print *, 'Warning: relative residual of ', residual
endif
enddo
enddo
cycle_time = dble(end_time - start_time) / dble(clock_precision)
print *, 'Computation completed successfully', cycle_time, 'seconds'
! Do not include the JIT compilation overhead from the first cycle.
if (c > 1) then
total_time = total_time + cycle_time
endif
endif
endif
enddo
print *, 'Total time:', total_time, 'seconds'
! Clean up
deallocate (a, b, a_orig, b_orig, x, ipiv, info)
end program solve_batched_linear_systems