From d0e7b782b676ebd5b9f41b2e990b1f190de0afea Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 20 Nov 2024 17:10:21 +0000 Subject: [PATCH 01/12] enable nhwc in ocl --- test/gtest/bn_bwd.cpp | 8 ++++---- test/gtest/bn_fwd_train.cpp | 8 ++++---- test/gtest/bn_infer.cpp | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index a84a8a8feb..9857c31f6f 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -95,7 +95,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_OCL_BWD_Large_FP16, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -110,7 +110,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_OCL_BWD_Large_BFP16, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -118,14 +118,14 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Small_FP32, testing::Combine(testing::ValuesIn(NetworkSmall()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Large_FP32, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); // // fp64 diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp index 9b4722aaf8..c9db49e8a3 100644 --- a/test/gtest/bn_fwd_train.cpp +++ b/test/gtest/bn_fwd_train.cpp @@ -101,7 +101,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_OCL_FWD_Train_Large_FP16, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); @@ -116,7 +116,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_OCL_FWD_Train_Large_BFP16, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); @@ -124,14 +124,14 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_FWD_Train_Small_FP32, testing::Combine(testing::ValuesIn(NetworkSmall()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_FWD_Train_Large_FP32, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); // // fp64 diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 591cbd0b1a..b1027e819c 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -102,7 +102,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_OCL_Infer_Large_FP16, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); // bfp16 @@ -116,7 +116,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_OCL_Infer_Large_BFP16, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); @@ -124,14 +124,14 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_Infer_Small_FP32, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_Infer_Large_FP32, testing::Combine(testing::ValuesIn(NetworkSmall()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); // fp64 From 9b62286ec93304728651999078c74571c8087627 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 21 Nov 2024 06:25:41 +0000 Subject: [PATCH 02/12] disable nhwc for ocl kernel and only enable ocl kernel for nchw and variant = 2 --- src/ocl/batchnormocl.cpp | 13 ++++---- .../batchnorm/backward_spatial_multiple.cpp | 25 ++++++++++++++ src/solver/batchnorm/forward_inference.cpp | 2 +- .../batchnorm/forward_spatial_multiple.cpp | 33 +++++++++++++++++++ 4 files changed, 65 insertions(+), 8 deletions(-) diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 6232759b0b..956ac99e38 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -152,9 +152,9 @@ void BatchNormForwardTraining(Handle& handle, }(); const auto solvers = solver::SolverContainer{}; + solver::batchnorm::BnFwdTrainingPerActivation, + solver::batchnorm::BnCKFwdTraining>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); @@ -250,9 +250,8 @@ void BatchNormForwardInference(Handle& handle, }(); const auto algo = AlgorithmName{"miopenBatchNormalizationForwardInference"}; - const auto solvers = solver::SolverContainer{}; + const auto solvers = solver::SolverContainer{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } @@ -395,9 +394,9 @@ void BatchNormBackward(Handle& handle, }(); const auto solvers = solver::SolverContainer{}; + solver::batchnorm::BnBwdTrainingPerActivation, + solver::batchnorm::BnCKBwdBackward>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index e26922f478..60fbc203ce 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -38,9 +38,34 @@ namespace solver { namespace batchnorm { +bool BNBwdIsCaseVariant2(const miopen::batchnorm::ProblemDescription& problem) +{ + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + unsigned int in_cstride = h * w; + unsigned int in_nhw = n * in_cstride; + + if(!(in_nhw < (32 * 1024 * 1024) && in_cstride > 1024) && + !(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) && !(in_cstride <= 512)) + { + return true; + } + else + return false; +} + bool BnBwdTrainingSpatialMultiple::IsApplicable( const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const { + if(!problem.IsLayoutNCHW()) + return false; + // NCHW is Applicable for variant = 2 only + if(!BNBwdIsCaseVariant2(problem)) + { + return false; + } + if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || problem.GetMode() != miopenBNSpatial) return false; diff --git a/src/solver/batchnorm/forward_inference.cpp b/src/solver/batchnorm/forward_inference.cpp index a05fce5105..59893fe3f2 100644 --- a/src/solver/batchnorm/forward_inference.cpp +++ b/src/solver/batchnorm/forward_inference.cpp @@ -41,7 +41,7 @@ namespace batchnorm { bool BnFwdInference::IsApplicable(const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const { - if(bn_problem.IsLayoutNHWC()) + if(!problem.IsLayoutNCHW()) return false; if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference) return false; diff --git a/src/solver/batchnorm/forward_spatial_multiple.cpp b/src/solver/batchnorm/forward_spatial_multiple.cpp index 6a2c42743b..cd7adaa72e 100644 --- a/src/solver/batchnorm/forward_spatial_multiple.cpp +++ b/src/solver/batchnorm/forward_spatial_multiple.cpp @@ -40,9 +40,42 @@ namespace solver { namespace batchnorm { +bool BNFwdTrainIsCaseVariant2(const miopen::batchnorm::ProblemDescription& problem) +{ + const auto& xDesc = problem.GetXDesc(); + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); + unsigned int in_cstride = h * w; + unsigned int in_nhw = n * in_cstride; + bool bfp32parm = xDesc.GetType() == miopenFloat; + bool bfpmixparm = (xDesc.GetType() == miopenHalf || xDesc.GetType() == miopenBFloat16) && + problem.GetBnScale().GetType() == miopenFloat; + + // NCHW is Applicable for variant = 2 only + if((!(n < 3) && + !((in_nhw < 33554432 && in_cstride > 1024) || + ((n >= 256) && (in_cstride > 60) && bfpmixparm) || ((in_cstride > 512) && bfpmixparm)) && + !(in_cstride <= 512)) || + !((n > 768) && (in_cstride > 150) && bfp32parm)) + { + return true; + } + else + return false; +} + bool BnFwdTrainingSpatialMultiple::IsApplicable( const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const { + if(!problem.IsLayoutNCHW()) + return false; + + if(!BNFwdTrainIsCaseVariant2(problem)) + { + return false; + } + // if NCHW check if variant is 2 else false (for all data type) + // update get solution to not change variant if(problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || problem.GetMode() != miopenBNSpatial) return false; From 54ef2726d576e2696492230b83a26080fcf7d0b9 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 22 Nov 2024 13:23:31 +0000 Subject: [PATCH 03/12] enable nhwc in bn infer, address review comments --- src/kernels/MIOpenBatchNormFwdInferSpatial.cl | 4 +- .../batchnorm/backward_spatial_multiple.cpp | 9 ++-- src/solver/batchnorm/forward_inference.cpp | 42 +++++++++++++------ .../batchnorm/forward_spatial_multiple.cpp | 22 +++++----- 4 files changed, 48 insertions(+), 29 deletions(-) diff --git a/src/kernels/MIOpenBatchNormFwdInferSpatial.cl b/src/kernels/MIOpenBatchNormFwdInferSpatial.cl index a81db2a03b..638ef47828 100644 --- a/src/kernels/MIOpenBatchNormFwdInferSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdInferSpatial.cl @@ -43,6 +43,8 @@ MIOpenBatchNormFwdInferSpatialEst(const __global _FLOAT* __restrict in, /* x inp const __global _FLOAT_PREC* __restrict bias, double epsilon, unsigned int batchSize, + unsigned int cLen, + unsigned int cStride, unsigned int imageDims, unsigned int batchStride) { @@ -66,7 +68,7 @@ MIOpenBatchNormFwdInferSpatialEst(const __global _FLOAT* __restrict in, /* x inp { for(int n = 0; n < batchSize; n++) { - index = (n * batchStride) + (xgid * imageDims) + idx; + index = (n * batchStride) + (xgid*cStride) + idx*cLen; inhat = (FLOAT2FLOATPREC(*(in + index)) - mean) * invVariance; out[index] = FLOATPREC2FLOAT(mad(pscale, inhat, pbias)); } diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 60fbc203ce..657ab817cd 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -40,14 +40,13 @@ namespace batchnorm { bool BNBwdIsCaseVariant2(const miopen::batchnorm::ProblemDescription& problem) { - int n, c, h, w; + size_t n, c, h, w; std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); - unsigned int in_cstride = h * w; - unsigned int in_nhw = n * in_cstride; + size_t in_cstride = h * w; + size_t in_nhw = n * in_cstride; - if(!(in_nhw < (32 * 1024 * 1024) && in_cstride > 1024) && - !(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) && !(in_cstride <= 512)) + if((in_nhw >= (32 * 1024 * 1024) || in_cstride <= 1024) && in_cstride > 512) { return true; } diff --git a/src/solver/batchnorm/forward_inference.cpp b/src/solver/batchnorm/forward_inference.cpp index 59893fe3f2..7aab643be7 100644 --- a/src/solver/batchnorm/forward_inference.cpp +++ b/src/solver/batchnorm/forward_inference.cpp @@ -41,8 +41,6 @@ namespace batchnorm { bool BnFwdInference::IsApplicable(const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const { - if(!problem.IsLayoutNCHW()) - return false; if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference) return false; if(!(bn_problem.IsFp32() or bn_problem.IsFp16() or bn_problem.IsBFp16())) @@ -149,16 +147,36 @@ ConvSolution BnFwdInference::GetSolution(const ExecutionContext& context, unsigned int in_nstride_ = c_ * h_ * w_; unsigned int in_cstride_ = h_ * w_; - kernel(params.x, - params.y, - params.estimatedMean, - params.estimatedVariance, - params.bnScale, - params.bnBias, - params.epsilon, - n_, - in_cstride_, - in_nstride_); + if(params.xDesc->GetLayout_t() == miopenTensorNHWC) + { + kernel(params.x, + params.y, + params.estimatedMean, + params.estimatedVariance, + params.bnScale, + params.bnBias, + params.epsilon, + n_, + c_, // nhwc = c + 1, + in_cstride_, + in_nstride_); + } + else + { + kernel(params.x, + params.y, + params.estimatedMean, + params.estimatedVariance, + params.bnScale, + params.bnBias, + params.epsilon, + n_, + 1, // nchw 1 + h_ * w_, + in_cstride_, + in_nstride_); + } }; }; diff --git a/src/solver/batchnorm/forward_spatial_multiple.cpp b/src/solver/batchnorm/forward_spatial_multiple.cpp index cd7adaa72e..1470225fbc 100644 --- a/src/solver/batchnorm/forward_spatial_multiple.cpp +++ b/src/solver/batchnorm/forward_spatial_multiple.cpp @@ -43,20 +43,20 @@ namespace batchnorm { bool BNFwdTrainIsCaseVariant2(const miopen::batchnorm::ProblemDescription& problem) { const auto& xDesc = problem.GetXDesc(); - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); - unsigned int in_cstride = h * w; - unsigned int in_nhw = n * in_cstride; - bool bfp32parm = xDesc.GetType() == miopenFloat; - bool bfpmixparm = (xDesc.GetType() == miopenHalf || xDesc.GetType() == miopenBFloat16) && + size_t n, c, h, w; + std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); + size_t in_cstride = h * w; + size_t in_nhw = n * in_cstride; + bool bfp32parm = xDesc.GetType() == miopenFloat; + bool bfpmixparm = (xDesc.GetType() == miopenHalf || xDesc.GetType() == miopenBFloat16) && problem.GetBnScale().GetType() == miopenFloat; // NCHW is Applicable for variant = 2 only - if((!(n < 3) && - !((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || ((in_cstride > 512) && bfpmixparm)) && - !(in_cstride <= 512)) || - !((n > 768) && (in_cstride > 150) && bfp32parm)) + // these number comes from BnFwdTrainingSpatialMultiple::GetSolution of + // forward_spatial_multiple.cpp + if((n >= 3 && in_cstride > 512 && (in_nhw >= 33554432 || in_cstride <= 1024) && + ((n < 256) || (in_cstride <= 60) || !bfpmixparm) && (!bfpmixparm || in_cstride <= 512)) || + (n <= 768 || in_cstride <= 150 || !bfp32parm)) { return true; } From 38486ec5ac9ae30b61dcdfd83df270198f0e16d6 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 22 Nov 2024 14:27:47 +0000 Subject: [PATCH 04/12] fix clang format --- src/kernels/MIOpenBatchNormFwdInferSpatial.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/MIOpenBatchNormFwdInferSpatial.cl b/src/kernels/MIOpenBatchNormFwdInferSpatial.cl index 638ef47828..24e268fd0d 100644 --- a/src/kernels/MIOpenBatchNormFwdInferSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdInferSpatial.cl @@ -68,7 +68,7 @@ MIOpenBatchNormFwdInferSpatialEst(const __global _FLOAT* __restrict in, /* x inp { for(int n = 0; n < batchSize; n++) { - index = (n * batchStride) + (xgid*cStride) + idx*cLen; + index = (n * batchStride) + (xgid * cStride) + idx * cLen; inhat = (FLOAT2FLOATPREC(*(in + index)) - mean) * invVariance; out[index] = FLOATPREC2FLOAT(mad(pscale, inhat, pbias)); } From 7e21c04074ada55565cfc1ec2fa5254befa24198 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 22 Nov 2024 15:05:13 +0000 Subject: [PATCH 05/12] fix hip tidy issue --- src/solver/batchnorm/backward_spatial_multiple.cpp | 2 +- src/solver/batchnorm/forward_spatial_multiple.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 657ab817cd..3a965df636 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -46,7 +46,7 @@ bool BNBwdIsCaseVariant2(const miopen::batchnorm::ProblemDescription& problem) size_t in_cstride = h * w; size_t in_nhw = n * in_cstride; - if((in_nhw >= (32 * 1024 * 1024) || in_cstride <= 1024) && in_cstride > 512) + if((in_nhw >= (32u * 1024 * 1024) || in_cstride <= 1024) && in_cstride > 512) { return true; } diff --git a/src/solver/batchnorm/forward_spatial_multiple.cpp b/src/solver/batchnorm/forward_spatial_multiple.cpp index 1470225fbc..5457184701 100644 --- a/src/solver/batchnorm/forward_spatial_multiple.cpp +++ b/src/solver/batchnorm/forward_spatial_multiple.cpp @@ -69,13 +69,13 @@ bool BnFwdTrainingSpatialMultiple::IsApplicable( { if(!problem.IsLayoutNCHW()) return false; - + // if NCHW check if variant is 2 else false (for all data type) + // update get solution to not change variant if(!BNFwdTrainIsCaseVariant2(problem)) { return false; } - // if NCHW check if variant is 2 else false (for all data type) - // update get solution to not change variant + if(problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || problem.GetMode() != miopenBNSpatial) return false; From dcc58f866a09f596e4b2d9fd2b0a5e3cbe860bdf Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 22 Nov 2024 15:40:06 +0000 Subject: [PATCH 06/12] hip cleanups --- src/solver/batchnorm/backward_spatial_multiple.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 3a965df636..2fa80fe145 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -46,7 +46,7 @@ bool BNBwdIsCaseVariant2(const miopen::batchnorm::ProblemDescription& problem) size_t in_cstride = h * w; size_t in_nhw = n * in_cstride; - if((in_nhw >= (32u * 1024 * 1024) || in_cstride <= 1024) && in_cstride > 512) + if((in_nhw >= static_cast(32 * 1024 * 1024) || in_cstride <= 1024) && in_cstride > 512) { return true; } From 281b230f03a50aa29998924739a95a677f8926c7 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 22 Nov 2024 18:00:18 +0000 Subject: [PATCH 07/12] fix index for nchw and nhwc for MIOpenBatchNormFwdInferPerAct.cl --- src/kernels/MIOpenBatchNormFwdInferPerAct.cl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/kernels/MIOpenBatchNormFwdInferPerAct.cl b/src/kernels/MIOpenBatchNormFwdInferPerAct.cl index f516a076ea..a505fd6b2a 100644 --- a/src/kernels/MIOpenBatchNormFwdInferPerAct.cl +++ b/src/kernels/MIOpenBatchNormFwdInferPerAct.cl @@ -43,6 +43,8 @@ MIOpenBatchNormFwdInferPerActivationEst(const __global _FLOAT* in, const __global _FLOAT_PREC* __restrict bias, double epsilon, unsigned int batchSize, + unsigned int cLen, + unsigned int cStride, unsigned int imageDims, unsigned int batchStride) { @@ -58,7 +60,7 @@ MIOpenBatchNormFwdInferPerActivationEst(const __global _FLOAT* in, for(int img_offset = ygid; img_offset < imageDims; img_offset += yglb_sz) { - adjIndex = (grpid * imageDims) + img_offset; + adjIndex = (grpid * cStride) + img_offset * cLen; mean = estimatedMean[adjIndex]; variance = estimatedVariance[adjIndex]; invVariance = rsqrt(fabs(variance + epsilon)); From 12a8920d2769d1a8b0e8cc304eec21bed83bb25d Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 13 Dec 2024 15:40:09 +0000 Subject: [PATCH 08/12] disable ck bn solver --- src/ocl/batchnormocl.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 956ac99e38..33a8243789 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -153,8 +153,8 @@ void BatchNormForwardTraining(Handle& handle, const auto solvers = solver::SolverContainer{}; + solver::batchnorm::BnFwdTrainingPerActivation>{}; + // solver::batchnorm::BnCKFwdTraining>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); @@ -250,8 +250,8 @@ void BatchNormForwardInference(Handle& handle, }(); const auto algo = AlgorithmName{"miopenBatchNormalizationForwardInference"}; - const auto solvers = solver::SolverContainer{}; + const auto solvers = solver::SolverContainer{}; + // solver::batchnorm::BnCKFwdInference>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } @@ -395,8 +395,8 @@ void BatchNormBackward(Handle& handle, const auto solvers = solver::SolverContainer{}; + solver::batchnorm::BnBwdTrainingPerActivation>{}; + // solver::batchnorm::BnCKBwdBackward>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); From 4f08f319a2711e66592974909121c54849ac9541 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 19 Dec 2024 06:31:27 +0000 Subject: [PATCH 09/12] bg/enable_nhwc_in_ocl : fix tensor initialization of driver. Make it similar to gtest --- driver/bn_driver.hpp | 69 ++++++++++++++++++++----------------- test/gtest/bn_test_data.hpp | 27 +++++++++------ 2 files changed, 54 insertions(+), 42 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index bfa1b91aef..80a43d9e38 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -195,10 +195,10 @@ int BatchNormDriver::GetandSetData() SetBNParametersFromCmdLineArgs(); in.AllocOnHost(tensor{bn_layout, in_len}); - for(size_t i = 0; i < in.GetVector().size(); i++) - { - in.GetVector()[i] = prng::gen_canonical(); - } + // 0.0 to 2.0 (since unsigned) + in.GetTensor().generate([](auto...) { + return prng::gen_descreet_unsigned(2e-3 /*scale*/, 1000 /*range*/); + }); auto derivedBnDesc = miopen::TensorDescriptor{}; miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); @@ -208,20 +208,25 @@ int BatchNormDriver::GetandSetData() out.AllocOnHost(tensor{bn_layout, in_len}); scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - - for(int i = 0; i < scale.GetVector().size(); i++) - { - scale.GetVector()[i] = prng::gen_canonical(); - bias.GetVector()[i] = prng::gen_canonical(); - } + // -2.0 to 2.0 + scale.GetTensor().generate([](auto...) { + return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); + }); + bias.GetTensor().generate([](auto...) { + return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); + }); } if(isFwdInfer) { estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - auto gen_value_emean = [](auto...) { return prng::gen_descreet_unsigned(1e-2, 100); }; - estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), true, gen_value_emean); + // 0.0 to 1.0 + estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), true, [](auto...) { + return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); + }); + estVariance.GetTensor().generate( + [](auto...) { return static_cast(2e-3 * (prng::gen_0_to_B(1000) + 1)); }); } else if(isFwdTrain) { @@ -230,11 +235,13 @@ int BatchNormDriver::GetandSetData() runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - for(int i = 0; i < runVariance.GetVector().size(); i++) - { - runMean.GetVector()[i] = prng::gen_canonical(); - runVariance.GetVector()[i] = prng::gen_canonical(); - } + // -2.0 to 2.0 + runMean.GetTensor().generate([](auto...) { + return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); + }); + runVariance.GetTensor().generate([](auto...) { + return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); + }); } else if(isBwd) { @@ -242,33 +249,33 @@ int BatchNormDriver::GetandSetData() bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dy.AllocOnHost(tensor{bn_layout, in_len}); - - auto gen_var_bwd = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - - dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd); + // -2.0 to 2.0 + dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, [](auto...) { + return prng::gen_descreet_uniform_sign(2e-3, 1000); + }); dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - auto gen_value = [](auto...) { return prng::gen_descreet_unsigned(1e-2, 100); }; - bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); - - auto gen_in_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + auto gen_value_bnScale = [](auto...) { + return prng::gen_descreet_uniform_sign(2e-3, 1000); }; - savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_in_var); - savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); + bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value_bnScale); + // -2.0 to 2.0 + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, [](auto...) { + return prng::gen_descreet_uniform_sign(2e-3, 1000); + }); + savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, [](auto...) { + return prng::gen_descreet_uniform_sign(2e-3, 1000); + }); } else { std::cout << "\nUnknown batch norm state!\n"; exit(EXIT_FAILURE); } - return miopenStatusSuccess; } diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index d3b1c6b073..f8f35a47c2 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -67,6 +67,7 @@ inline std::vector NetworkLarge() // pyt_mlperf_resnet50v1.5 return { {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {12, 40, 122, 122, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, @@ -101,7 +102,7 @@ inline std::vector NetworkSmall() { // pyt_mlperf_resnet50v1.5 return { - {192, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {12, 40, 122, 122, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, {16, 8, 132, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, {64, 2048, 17, 17, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, @@ -148,8 +149,8 @@ struct BNTestData void InitTensorsWithRandValue() { - input.generate( - [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }); + // 0.0 to 2.0 (since unsigned) + input.generate([](auto...) { return prng::gen_descreet_unsigned(2e-3 /*scale*/, 1000 /*range*/); }); } void SetDirection() { direction = bn_config.Direction; } @@ -212,15 +213,17 @@ struct BNInferTestData : public BNTestData void InitTensorsWithRandValue() { + // -2.0 to 2.0 auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); + return prng::gen_descreet_uniform_sign(2e-3, 1000); }; scale.generate(gen_value); shift.generate(gen_value); estMean.generate(gen_value); + // 0.0 to 2.0 auto gen_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + return static_cast(2e-3 * (prng::gen_0_to_B(1000) + 1)); }; estVariance.generate(gen_var); } @@ -303,15 +306,16 @@ struct BNBwdTestData : public BNTestData void InitTensorsWithRandValue() { + // -2.0 to 2.0 auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); + return prng::gen_descreet_uniform_sign(2e-3, 1000); }; dy.generate(gen_value); bnScale.generate(gen_value); savedMean.generate(gen_value); - + // 0.0 to 2.0 auto gen_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + return static_cast(2e-3 * (prng::gen_0_to_B(1000) + 1)); }; savedInvVar.generate(gen_var); @@ -400,14 +404,15 @@ struct BNFwdTrainTestData : public BNTestData void InitTensorsWithRandValue() { + // -2.0 to 2.0 auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); + return prng::gen_descreet_uniform_sign(2e-3, 1000); }; scale.generate(gen_value); shift.generate(gen_value); - + // 0.0 to 2.0 auto gen_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + return static_cast(2e-3 * (prng::gen_0_to_B(1000) + 1)); }; runMean.generate(gen_var); runVariance.generate(gen_var); From aa999e0cd343d66b03a9abc6d1e075aeb008befa Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 19 Dec 2024 12:33:42 +0000 Subject: [PATCH 10/12] fix clang format --- test/gtest/bn_test_data.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index f8f35a47c2..36c01c5712 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -150,7 +150,9 @@ struct BNTestData void InitTensorsWithRandValue() { // 0.0 to 2.0 (since unsigned) - input.generate([](auto...) { return prng::gen_descreet_unsigned(2e-3 /*scale*/, 1000 /*range*/); }); + input.generate([](auto...) { + return prng::gen_descreet_unsigned(2e-3 /*scale*/, 1000 /*range*/); + }); } void SetDirection() { direction = bn_config.Direction; } From bcc0ae72d812a41ca57b51d5bdfc8f6023ff02dd Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 19 Dec 2024 16:59:01 +0000 Subject: [PATCH 11/12] fix type issue --- driver/bn_driver.hpp | 8 ++++---- test/gtest/bn_test_data.hpp | 38 ++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 80a43d9e38..e56680b806 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -210,10 +210,10 @@ int BatchNormDriver::GetandSetData() bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); // -2.0 to 2.0 scale.GetTensor().generate([](auto...) { - return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); + return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); }); bias.GetTensor().generate([](auto...) { - return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); + return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); }); } if(isFwdInfer) @@ -237,10 +237,10 @@ int BatchNormDriver::GetandSetData() // -2.0 to 2.0 runMean.GetTensor().generate([](auto...) { - return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); + return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); }); runVariance.GetTensor().generate([](auto...) { - return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); + return prng::gen_descreet_uniform_sign(2e-3 /*scale*/, 1000 /*range*/); }); } else if(isBwd) diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 36c01c5712..c22acd7f0e 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -216,12 +216,12 @@ struct BNInferTestData : public BNTestData void InitTensorsWithRandValue() { // -2.0 to 2.0 - auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(2e-3, 1000); - }; - scale.generate(gen_value); - shift.generate(gen_value); - estMean.generate(gen_value); + scale.generate( + [](auto...) { return prng::gen_descreet_uniform_sign(2e-3, 1000); }); + shift.generate( + [](auto...) { return prng::gen_descreet_uniform_sign(2e-3, 1000); }); + estMean.generate( + [](auto...) { return prng::gen_descreet_uniform_sign(2e-3, 1000); }); // 0.0 to 2.0 auto gen_var = [](auto...) { @@ -309,17 +309,16 @@ struct BNBwdTestData : public BNTestData void InitTensorsWithRandValue() { // -2.0 to 2.0 - auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(2e-3, 1000); - }; - dy.generate(gen_value); - bnScale.generate(gen_value); - savedMean.generate(gen_value); + dy.generate( + [](auto...) { return prng::gen_descreet_uniform_sign(2e-3, 1000); }); + bnScale.generate( + [](auto...) { return prng::gen_descreet_uniform_sign(2e-3, 1000); }); + savedMean.generate( + [](auto...) { return prng::gen_descreet_uniform_sign(2e-3, 1000); }); // 0.0 to 2.0 - auto gen_var = [](auto...) { + savedInvVar.generate([](auto...) { return static_cast(2e-3 * (prng::gen_0_to_B(1000) + 1)); - }; - savedInvVar.generate(gen_var); + }); std::fill(dScale.begin(), dScale.end(), 0.); std::fill(dBias.begin(), dBias.end(), 0.); @@ -407,11 +406,10 @@ struct BNFwdTrainTestData : public BNTestData void InitTensorsWithRandValue() { // -2.0 to 2.0 - auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(2e-3, 1000); - }; - scale.generate(gen_value); - shift.generate(gen_value); + scale.generate( + [](auto...) { return prng::gen_descreet_uniform_sign(2e-3, 1000); }); + shift.generate( + [](auto...) { return prng::gen_descreet_uniform_sign(2e-3, 1000); }); // 0.0 to 2.0 auto gen_var = [](auto...) { return static_cast(2e-3 * (prng::gen_0_to_B(1000) + 1)); From c853342ab99588f4dd7d630530f16e4167458e07 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 20 Dec 2024 02:43:30 +0000 Subject: [PATCH 12/12] remove ncwh guard for fwd train multiple --- src/solver/batchnorm/forward_spatial_multiple.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/solver/batchnorm/forward_spatial_multiple.cpp b/src/solver/batchnorm/forward_spatial_multiple.cpp index 5457184701..a7a0f871ac 100644 --- a/src/solver/batchnorm/forward_spatial_multiple.cpp +++ b/src/solver/batchnorm/forward_spatial_multiple.cpp @@ -67,8 +67,6 @@ bool BNFwdTrainIsCaseVariant2(const miopen::batchnorm::ProblemDescription& probl bool BnFwdTrainingSpatialMultiple::IsApplicable( const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const { - if(!problem.IsLayoutNCHW()) - return false; // if NCHW check if variant is 2 else false (for all data type) // update get solution to not change variant if(!BNFwdTrainIsCaseVariant2(problem))