Skip to content

Commit 1af991b

Browse files
authored
check cuda error for every kernel launch (k2-fsa#215)
1 parent 18c4040 commit 1af991b

File tree

6 files changed

+51
-39
lines changed

6 files changed

+51
-39
lines changed

k2/csrc/array_ops_inl.h

+8-6
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ void Transpose(ContextPtr &c, const Array2<T> &src, Array2<T> *dest) {
149149
// TODO(haowen): limit the number of elements?
150150
K2_CHECK_EQ(rows, dest->Dim1());
151151
K2_CHECK_EQ(cols, dest->Dim0());
152+
if (rows == 0 || cols == 0) return;
152153
int32_t src_elem_stride0 = src.ElemStride0();
153154
int32_t dest_elem_stride0 = dest->ElemStride0();
154155
const T *src_data = src.Data();
@@ -166,10 +167,11 @@ void Transpose(ContextPtr &c, const Array2<T> &src, Array2<T> *dest) {
166167
dim3 block_size(internal::kTransTileDim, internal::kTransBlockRows, 1);
167168
dim3 grid_size(NumBlocks(cols, internal::kTransTileDim),
168169
NumBlocks(rows, internal::kTransTileDim));
169-
internal::TransposeKernel<<<grid_size, block_size, 0, c->GetCudaStream()>>>(
170-
rows, cols, src_elem_stride0, dest_elem_stride0, src_data, dest_data);
171-
auto ret = cudaDeviceSynchronize();
172-
K2_CHECK_CUDA_ERROR(ret);
170+
K2_CUDA_SAFE_CALL(
171+
internal::
172+
TransposeKernel<<<grid_size, block_size, 0, c->GetCudaStream()>>>(
173+
rows, cols, src_elem_stride0, dest_elem_stride0, src_data,
174+
dest_data));
173175
}
174176
}
175177

@@ -415,12 +417,12 @@ void ApplyOpOnArray1(Array1<T> &src, T default_value, Array1<T> *dest) {
415417
void *d_temp_storage = nullptr;
416418
size_t temp_storage_bytes = 0;
417419
// the first time is to determine temporary device storage requirements
418-
K2_CHECK_CUDA_ERROR(cub::DeviceReduce::Reduce(
420+
K2_CUDA_SAFE_CALL(cub::DeviceReduce::Reduce(
419421
d_temp_storage, temp_storage_bytes, src_data, dest_data, size, op,
420422
default_value, c->GetCudaStream()));
421423
void *deleter_context;
422424
d_temp_storage = c->Allocate(temp_storage_bytes, &deleter_context);
423-
K2_CHECK_CUDA_ERROR(cub::DeviceReduce::Reduce(
425+
K2_CUDA_SAFE_CALL(cub::DeviceReduce::Reduce(
424426
d_temp_storage, temp_storage_bytes, src_data, dest_data, size, op,
425427
default_value, c->GetCudaStream()));
426428
}

k2/csrc/context.h

+6-10
Original file line numberDiff line numberDiff line change
@@ -404,9 +404,8 @@ void Eval(cudaStream_t stream, int32_t n, LambdaT &lambda) {
404404
} else {
405405
int32_t block_size = 256;
406406
int32_t grid_size = NumBlocks(n, block_size);
407-
eval_lambda<LambdaT><<<grid_size, block_size, 0, stream>>>(n, lambda);
408-
auto err = cudaGetLastError();
409-
K2_DCHECK_CUDA_ERROR(err);
407+
K2_CUDA_SAFE_CALL(eval_lambda<LambdaT>
408+
<<<grid_size, block_size, 0, stream>>>(n, lambda));
410409
}
411410
}
412411

@@ -431,10 +430,8 @@ void Eval(cudaStream_t stream, T *data, int32_t n, LambdaT &lambda) {
431430
} else {
432431
int32_t block_size = 256;
433432
int32_t grid_size = NumBlocks(n, block_size);
434-
eval_lambda<T, LambdaT>
435-
<<<grid_size, block_size, 0, stream>>>(data, n, lambda);
436-
auto err = cudaGetLastError();
437-
K2_DCHECK_CUDA_ERROR(err);
433+
K2_CUDA_SAFE_CALL(eval_lambda<T, LambdaT>
434+
<<<grid_size, block_size, 0, stream>>>(data, n, lambda));
438435
}
439436
}
440437

@@ -472,9 +469,8 @@ void Eval2(cudaStream_t stream, int32_t m, int32_t n, LambdaT &lambda) {
472469
// GetBlockSizesForSimpleMatrixOperation().
473470
dim3 block_size(16, 16, 1);
474471
dim3 grid_size(NumBlocks(n, 16), NumBlocks(m, 16));
475-
eval_lambda2<<<grid_size, block_size, 0, stream>>>(m, n, lambda);
476-
auto err = cudaGetLastError();
477-
K2_DCHECK_CUDA_ERROR(err);
472+
K2_CUDA_SAFE_CALL(
473+
eval_lambda2<<<grid_size, block_size, 0, stream>>>(m, n, lambda));
478474
}
479475
}
480476

k2/csrc/log.h

+21-7
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ class Logger {
140140

141141
class Voidifier {
142142
public:
143-
K2_CUDA_HOSTDEV void operator&(const Logger &) const {}
143+
K2_CUDA_HOSTDEV void operator&(const Logger &)const {}
144144
};
145145

146146
} // namespace internal
@@ -190,15 +190,22 @@ class Voidifier {
190190
#define K2_LOG(x) \
191191
::k2::internal::Logger(__FILE__, __func__, __LINE__, ::k2::internal::x)
192192

193+
// `x` would be error code returned from any cuda function call or kernel
194+
// launch.
195+
//
196+
// Caution: don't do this:
197+
// K2_CHECK_CUDA_ERROR(cudaGetLastError())
198+
// as it will call `cudaGetLastError` twice and reset the error status.
193199
#define K2_CHECK_CUDA_ERROR(x) \
194200
K2_CHECK_EQ(x, cudaSuccess) << " Error: " << cudaGetErrorString(x) << ". "
195201

196-
#define K2_CUDA_SAFE_CALL(...) \
197-
do { \
198-
__VA_ARGS__; \
199-
cudaError_t e = ::k2::internal::kDisableDebug ? cudaGetLastError() \
200-
: cudaDeviceSynchronize(); \
201-
K2_CHECK_CUDA_ERROR(e); \
202+
// The parameter should be cuda function call or kernel launch.
203+
#define K2_CUDA_SAFE_CALL(...) \
204+
do { \
205+
__VA_ARGS__; \
206+
if (!::k2::internal::kDisableDebug) cudaDeviceSynchronize(); \
207+
cudaError_t e = cudaGetLastError(); \
208+
K2_CHECK_CUDA_ERROR(e); \
202209
} while (0)
203210

204211
// ============================================================
@@ -229,6 +236,13 @@ class Voidifier {
229236
::k2::internal::kDisableDebug ? (void)0 \
230237
: ::k2::internal::Voidifier() & K2_LOG(x)
231238

239+
// `x` would be error code returned from any cuda function call or kernel
240+
// launch.
241+
//
242+
// CAUTION: don't do this:
243+
// auto error = cudaGetLastError();
244+
// K2_DCHECK_CUDA_ERROR(error);
245+
// as you may reset the error status without checking it in release mode.
232246
#define K2_DCHECK_CUDA_ERROR(x) \
233247
::k2::internal::kDisableDebug ? (void)0 : K2_CHECK_CUDA_ERROR(x)
234248

k2/csrc/ragged.cu

+6-6
Original file line numberDiff line numberDiff line change
@@ -193,14 +193,14 @@ int32_t RaggedShape::MaxSize(int32_t axis) {
193193
void *d_temp_storage = nullptr;
194194
size_t temp_storage_bytes = 0;
195195
// the first time is to determine temporary device storage requirements
196-
K2_CHECK_CUDA_ERROR(cub::DeviceReduce::Max(
197-
d_temp_storage, temp_storage_bytes, row_splits_diff, max_value,
198-
num_rows, c->GetCudaStream()));
196+
K2_CUDA_SAFE_CALL(cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes,
197+
row_splits_diff, max_value,
198+
num_rows, c->GetCudaStream()));
199199
void *deleter_context;
200200
d_temp_storage = c->Allocate(temp_storage_bytes, &deleter_context);
201-
K2_CHECK_CUDA_ERROR(cub::DeviceReduce::Max(
202-
d_temp_storage, temp_storage_bytes, row_splits_diff, max_value,
203-
num_rows, c->GetCudaStream()));
201+
K2_CUDA_SAFE_CALL(cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes,
202+
row_splits_diff, max_value,
203+
num_rows, c->GetCudaStream()));
204204
c->Deallocate(d_temp_storage, deleter_context);
205205
// this will convert to memory on CPU
206206
return max_array[0];

k2/csrc/timer.h

+8-8
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,24 @@ namespace k2 {
1818
class Timer {
1919
public:
2020
Timer() {
21-
K2_CHECK_CUDA_ERROR(cudaEventCreate(&time_start_));
22-
K2_CHECK_CUDA_ERROR(cudaEventCreate(&time_end_));
21+
K2_CUDA_SAFE_CALL(cudaEventCreate(&time_start_));
22+
K2_CUDA_SAFE_CALL(cudaEventCreate(&time_end_));
2323
Reset();
2424
}
2525

2626
~Timer() {
27-
K2_CHECK_CUDA_ERROR(cudaEventDestroy(time_start_));
28-
K2_CHECK_CUDA_ERROR(cudaEventDestroy(time_end_));
27+
K2_CUDA_SAFE_CALL(cudaEventDestroy(time_start_));
28+
K2_CUDA_SAFE_CALL(cudaEventDestroy(time_end_));
2929
}
3030

31-
void Reset() { K2_CHECK_CUDA_ERROR(cudaEventRecord(time_start_, 0)); }
31+
void Reset() { K2_CUDA_SAFE_CALL(cudaEventRecord(time_start_, 0)); }
3232

3333
double Elapsed() {
34-
K2_CHECK_CUDA_ERROR(cudaEventRecord(time_end_, 0));
35-
K2_CHECK_CUDA_ERROR(cudaEventSynchronize(time_end_));
34+
K2_CUDA_SAFE_CALL(cudaEventRecord(time_end_, 0));
35+
K2_CUDA_SAFE_CALL(cudaEventSynchronize(time_end_));
3636

3737
float ms_elapsed;
38-
K2_CHECK_CUDA_ERROR(
38+
K2_CUDA_SAFE_CALL(
3939
cudaEventElapsedTime(&ms_elapsed, time_start_, time_end_));
4040
return ms_elapsed / 1e3;
4141
}

k2/csrc/utils_inl.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,11 @@ void ExclusiveSum(ContextPtr &c, int32_t n, SrcPtr src, DestPtr dest) {
4343
std::size_t temp_storage_bytes = 0;
4444
// since d_temp_storage is nullptr, the following function will compute
4545
// the number of required bytes for d_temp_storage
46-
K2_CHECK_CUDA_ERROR(cub::DeviceScan::ExclusiveSum(
46+
K2_CUDA_SAFE_CALL(cub::DeviceScan::ExclusiveSum(
4747
d_temp_storage, temp_storage_bytes, src, dest, n, c->GetCudaStream()));
4848
void *deleter_context;
4949
d_temp_storage = c->Allocate(temp_storage_bytes, &deleter_context);
50-
K2_CHECK_CUDA_ERROR(cub::DeviceScan::ExclusiveSum(
50+
K2_CUDA_SAFE_CALL(cub::DeviceScan::ExclusiveSum(
5151
d_temp_storage, temp_storage_bytes, src, dest, n, c->GetCudaStream()));
5252
c->Deallocate(d_temp_storage, deleter_context);
5353
}

0 commit comments

Comments
 (0)