diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 08bc5bb693..98c287ac78 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -45,7 +45,7 @@ #define MIO_BN_DEBUG 1 #define MIO_BN_MAX_DEBUGLOOP 65536 -#define EPSILON 1e-6 +#define EPSILON 1e-4 #define ERRTOL 1e-6 #define RMSTOL 1e-6 @@ -1286,8 +1286,8 @@ int BatchNormDriver::VerifyBackward() if(!back) return miopenStatusSuccess; - const double tolerance = ERRTOL; - const double maxrms = RMSTOL; + const double tolerance = ERRTOL * 1000; + const double maxrms = RMSTOL * 1000; double diff = 0.; bool anError = false; diff --git a/driver/miopen_BatchNormHost.hpp b/driver/miopen_BatchNormHost.hpp index 14ff4b2263..a99f6a014c 100644 --- a/driver/miopen_BatchNormHost.hpp +++ b/driver/miopen_BatchNormHost.hpp @@ -30,7 +30,7 @@ #include #include -#define MIO_HEIRARCH_SEL 0 +#define MIO_HEIRARCH_SEL 1 #if(MIO_HEIRARCH_SEL == 1) #define MIO_BN_DIST 32 @@ -38,8 +38,9 @@ template int miopenBNFwdTrainPerActivationRunHost( - /* T alpha, - T beta, + /* + T alpha, + T beta, */ int n_batchs, int channels, @@ -743,7 +744,6 @@ int miopenBNBwdPerActivationRunHost( dxhat += tmp1; dxhathat += tmp1 * xhat[xhat_index]; } // end for(n_batchs) - dscale_ptr[adjIndex] /= double(n_batchs); for(int bidx = 0; bidx < n_batchs; bidx++) { // via mini_batch @@ -812,7 +812,6 @@ int miopenBNBwdPerActivationRunHost( dxhat += tmp1; dxhathat += tmp1 * xhat[xhat_index]; } // end for(n_batchs) - dscale_ptr[adjIndex] /= double(n_batchs); for(int bidx = 0; bidx < n_batchs; bidx++) { // via mini_batch @@ -891,7 +890,7 @@ int miopenBNBwdSpatialRunHost( } // end for(n_batchs) } // for (column) } // for (row) - dscale_ptr[cidx] /= NHW; + // process the batch per channel for(int row = 0; row < height; row++) { // via rows @@ -1087,17 +1086,15 @@ int miopenBNBwdSpatialRunHost( } #endif - dscale_ptr[cidx] /= NHW; // printf("dscale: %f\n",dscale_ptr[cidx]); // printf("dbias: %f\n",dbias_ptr[cidx]); -// printf("HELLO BASTARDS!!!"); + #if(MIO_HEIRARCH_SEL == 0) for(int row = 0; row < height; row++) { // via rows for(int column = 0; column < width; column++) { // via columns adjIndex = Csubindex + width * row + column; - for(int bidx = 0; bidx < n_batchs; bidx++) { // via mini_batch index = in_nstride * bidx + adjIndex; diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a7b4462dc2..68af8a3f39 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -376,8 +376,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGet4dTensorDescriptor(miopenTensorDescriptor_ * @param tensorDesc Tensor descriptor type (input) * @param dataType Currently only miopenFloat is implemented (input) * @param nbDims Number of dimensions in the dimsA array (input) - * @param dimsA Array containing the size of dimensions (output) - * @param stridesA Array containing the size of stride (output) + * @param dimsA Array containing the size of dimensions (input) + * @param stridesA Array containing the size of stride (input) * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t tensorDesc, diff --git a/src/batch_norm.cpp b/src/batch_norm.cpp index 2d8585f154..0106925562 100644 --- a/src/batch_norm.cpp +++ b/src/batch_norm.cpp @@ -27,6 +27,8 @@ #include #include +#define MIOPEN_BN_SYNCH 0 + namespace miopen { void DeriveBNTensorDescriptor(TensorDescriptor& derivedBnDesc, @@ -73,7 +75,9 @@ inline void profileSequence(Handle& handle, unsigned char select) } else { +#if(MIOPEN_BN_SYNCH) handle.Finish(); +#endif } break; case 1: @@ -89,7 +93,9 @@ inline void profileSequence(Handle& handle, unsigned char select) } else { +#if(MIOPEN_BN_SYNCH) handle.Finish(); +#endif } break; @@ -382,7 +388,7 @@ void bnBwdTrainSelectMulti(Handle& handle, kernel_subname = kernel_name + "FinalDScale"; handle.GetKernel(algo_name, network_config, program_name, kernel_subname, vld, vgd, parms)( - dx, dScale, inhw); + dx, dScale); profileSequence(handle, 1); kernel_subname = kernel_name + "DX"; @@ -433,7 +439,7 @@ void bnBwdTrainSelectMulti(Handle& handle, kernel_subname = kernel_name + "FinalDScale"; handle.GetKernel(algo_name, network_config, program_name, kernel_subname, vld, vgd, parms)( - dx, dScale, inhw); + dx, dScale); profileSequence(handle, 1); kernel_subname = kernel_name + "DX"; diff --git a/src/kernels/MIOpenBatchNormBwdPerAct.cl b/src/kernels/MIOpenBatchNormBwdPerAct.cl index 68ce1a24a4..7568087fb3 100644 --- a/src/kernels/MIOpenBatchNormBwdPerAct.cl +++ b/src/kernels/MIOpenBatchNormBwdPerAct.cl @@ -146,7 +146,6 @@ __kernel void BatchNormBwdPerActivationSaved(const __global _FLOAT* x_in, dxhat += tmp1; dxhathat = mad(tmp1, xhat, dxhathat); } // end for(n) - pvt_dscale /= (_FLOAT)N; for(int n = 0; n < N; n++) { @@ -246,7 +245,6 @@ __kernel void BatchNormBwdPerActivation(const __global _FLOAT* x_in, dxhat += tmp1; dxhathat = mad(tmp1, xhat, dxhathat); } // end for(n) - pvt_dscale /= (_FLOAT)N; for(int n = 0; n < N; n++) { diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 962b3d81be..f0559c76b3 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -101,9 +101,11 @@ #undef __AMDGCN__ #endif -//#ifdef __AMDGCN__ -//#undef __AMDGCN__ -//#endif +/* +#ifdef __AMDGCN__ +#undef __AMDGCN__ +#endif +*/ // Disable specific warnings #ifdef __clang__ @@ -409,7 +411,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, lcl_data[lid] += lcl_data[lid + red]; barrier(CLK_LOCAL_MEM_FENCE); } - dppLDSReduce64(&ds, lcl_data, lid, INHW); + dppLDSReduce64(&ds, lcl_data, lid, 1); #else for(unsigned int red = (MIO_BN_GRP0 >> 1); red > 256; red >>= 1) { @@ -417,7 +419,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, lcl_data[lid] += lcl_data[lid + red]; barrier(CLK_LOCAL_MEM_FENCE); } - regLDSreduce(&ds, lcl_data, lid, INHW); + regLDSreduce(&ds, lcl_data, lid, 1); #endif if(lid < MIO_BN_SEGMENT) @@ -649,16 +651,14 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #ifdef __AMDGCN__ #if(MIO_BN_HW > 16) - dppRegReduce64(&ds, INHW); + dppRegReduce64(&ds, 1); #elif(MIO_BN_HW > 1) - dppRegReduce16(&ds, INHW); -#else - ds *= INHW; + dppRegReduce16(&ds, 1); #endif // HW #else // if not GCN #if(MIO_BN_HW > 16) - regLDSreduce(&ds, lcl_data, ylid, INHW); + regLDSreduce(&ds, lcl_data, ylid, 1); #elif(MIO_BN_HW > 1) lcl_data[ylid] = ds; barrier(CLK_LOCAL_MEM_FENCE); @@ -668,9 +668,6 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { ds += lcl_data[i]; } - ds *= INHW; -#else - ds *= INHW; #endif // HW #endif // GCN //=========================================== @@ -686,7 +683,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { index = n * MIO_BN_CHW + cidx + ylid; tmp1 = mad(NHW, dyvalues[n], -db); - tmp2 = -batchvalues[n] * ds; + tmp2 = -(batchvalues[n]) * ds; tmp3 = (pscale * invVar) * INHW; dx_out[index] = tmp3 * (tmp2 + tmp1); } @@ -901,7 +898,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #else // GCN #if(MIO_BN_N > 16) - regLDSreduce(&db, lcl_data, ylid, INHW); + regLDSreduce(&db, lcl_data, ylid, 1); #elif(MIO_BN_N > 1) lcl_data[ylid] = db; barrier(CLK_LOCAL_MEM_FENCE); @@ -917,7 +914,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #ifdef __AMDGCN__ #if(MIO_BN_N > 16) - dppRegReduce64(&ds, INHW); + dppRegReduce64(&ds, 1); #elif(MIO_BN_N > 1) lcl_data[ylid] = ds; barrier(CLK_LOCAL_MEM_FENCE); @@ -927,14 +924,11 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { ds += lcl_data[i]; } - ds *= INHW; -#else - ds *= INHW; #endif // N #else // if not GCN #if(MIO_BN_N > 16) - regLDSreduce(&ds, lcl_data, ylid, INHW); + regLDSreduce(&ds, lcl_data, ylid, 1); #elif(MIO_BN_N > 1) lcl_data[ylid] = ds; barrier(CLK_LOCAL_MEM_FENCE); @@ -944,9 +938,6 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { ds += lcl_data[i]; } - ds *= INHW; -#else - ds *= INHW; #endif // HW #endif // GCN //=========================================== @@ -962,7 +953,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { index = ylid * MIO_BN_CHW + cidx + hw; tmp1 = mad(NHW, dyvalues[hw], -db); - tmp2 = -batchvalues[hw] * ds; + tmp2 = -(batchvalues[hw]) * ds; tmp3 = (pscale * invVar) * INHW; dx_out[index] = tmp3 * (tmp2 + tmp1); } @@ -1231,7 +1222,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, barrier(CLK_LOCAL_MEM_FENCE); lcl_data[ylid] = ds; barrier(CLK_LOCAL_MEM_FENCE); - dppLDSReduce64(&ds, lcl_data, ylid, INHW); + dppLDSReduce64(&ds, lcl_data, ylid, 1); #else for(unsigned int red = (MIO_BN_GRP1 >> 1); red > 256; red >>= 1) @@ -1240,7 +1231,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, lcl_data[ylid] += lcl_data[ylid + red]; barrier(CLK_LOCAL_MEM_FENCE); } - regLDSreduce(&ds, lcl_data, ylid, INHW); + regLDSreduce(&ds, lcl_data, ylid, 1); #endif //=========================================== @@ -1256,7 +1247,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, index = n * MIO_BN_CHW + cidx + ylid; #if(MIO_BN_N < MIO_BN_MAXN) tmp1 = mad(NHW, dyvalues[n], -db); - tmp2 = -batchvalues[n] * ds; + tmp2 = -(batchvalues[n]) * ds; #else tmp1 = mad(NHW, dy_in[index], -db); tmp2 = -(x_in[index] - mean) * invVar * ds; @@ -1761,7 +1752,7 @@ BatchNormBwdSpatialDScale(const __global _FLOAT* x_in, unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; lmean = buff[meanstashindex]; // load stashed mean livar = buff[varstashindex]; -#else // SAVED +#else // NO SAVED lmean = savedMean[xgid]; livar = savedInvVariance[xgid]; #endif // SAVED @@ -1781,6 +1772,7 @@ BatchNormBwdSpatialDScale(const __global _FLOAT* x_in, elemStd = x_in[index] - mean; // (x_i - mean) xhat = elemStd * invVar; dscale = mad(xhat, dy_in[index], dscale); + // dscale += 1.; } // end for } // end if @@ -1809,7 +1801,6 @@ BatchNormBwdSpatialDScale(const __global _FLOAT* x_in, regLDSreduce(&dscale, lcl_data, ylid, 1); #endif // GCN - if(ylid == 0) { unsigned int gammaindex = cidx + ygrp_sz * ygrp_id + 4; @@ -1818,7 +1809,7 @@ BatchNormBwdSpatialDScale(const __global _FLOAT* x_in, } __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void -BatchNormBwdSpatialFinalDScale(__global _FLOAT* buff, __global _FLOAT* delta_scale, _FLOAT INHW) +BatchNormBwdSpatialFinalDScale(__global _FLOAT* buff, __global _FLOAT* delta_scale) { __private _FLOAT ds = 0.; @@ -1852,7 +1843,7 @@ BatchNormBwdSpatialFinalDScale(__global _FLOAT* buff, __global _FLOAT* delta_sca lcl_data[ylid] += lcl_data[ylid + red]; barrier(CLK_LOCAL_MEM_FENCE); } - dppLDSReduce64(&ds, lcl_data, ylid, INHW); + dppLDSReduce64(&ds, lcl_data, ylid, 1); #else // GCN for(unsigned int red = (MIO_BN_GRP1 >> 1); red > 256; red >>= 1) { @@ -1860,21 +1851,21 @@ BatchNormBwdSpatialFinalDScale(__global _FLOAT* buff, __global _FLOAT* delta_sca lcl_data[ylid] += lcl_data[ylid + red]; barrier(CLK_LOCAL_MEM_FENCE); } - regLDSreduce(&ds, lcl_data, ylid, INHW); + regLDSreduce(&ds, lcl_data, ylid, 1); #endif // GCN #elif(MIO_BN_NGRPS <= 64) #ifdef __AMDGCN__ - dppRegReduce64(&ds, INHW); + dppRegReduce64(&ds, 1); #else // GCN __local _FLOAT lcl_data[MIO_BN_LDS_SIZE]; - regLDSreduce(&ds, lcl_data, ylid, INHW); + regLDSreduce(&ds, lcl_data, ylid, 1); #endif // GCN #else // else < 16 #ifdef __AMDGCN__ - dppRegReduce16(&ds, INHW); + dppRegReduce16(&ds, 1); #else // GCN __local _FLOAT lcl_data[MIO_BN_LDS_SIZE]; lcl_data[ylid] = ds; @@ -1885,7 +1876,6 @@ BatchNormBwdSpatialFinalDScale(__global _FLOAT* buff, __global _FLOAT* delta_sca { ds += lcl_data[i]; } - ds *= INHW; #endif // end AMDGCN #endif // NGRPS diff --git a/test/bn_peract_test.cpp b/test/bn_peract_test.cpp index cf1a112d96..76751bcb68 100644 --- a/test/bn_peract_test.cpp +++ b/test/bn_peract_test.cpp @@ -48,7 +48,7 @@ // Run CPU emulations in hierarchical reduction mode. //#define MIO_HEIRARCH_SEL 0 #define MIO_BN_TEST_EXPAVGFACTOR 0.1 -#define MIO_BN_TEST_EPSILON 1e-6 +#define MIO_BN_TEST_EPSILON 1e-5 //**************************************************** // FORWARD TRAIN @@ -577,7 +577,6 @@ struct verify_backward_bn_per_activation_use_saved dxhathat += tmp1 * xhat[xhat_index]; } // end for(n_batchs) - dscale(0, cidx, row, column) /= n; for(int bidx = 0; bidx < n_batch; bidx++) { // via mini_batch @@ -771,7 +770,6 @@ struct verify_backward_bn_per_activation_recalc dxhathat += tmp1 * xhat[xhat_index]; } // end for(n_batchs) - dscale(0, cidx, row, column) /= n; for(int bidx = 0; bidx < n_batch; bidx++) { // via mini_batch diff --git a/test/bn_spatial_test.cpp b/test/bn_spatial_test.cpp index 57c5349139..8c13486e4b 100644 --- a/test/bn_spatial_test.cpp +++ b/test/bn_spatial_test.cpp @@ -41,11 +41,11 @@ #include #include #include - +#include // Run CPU emulations in hierarchical reduction mode. #define MIO_HEIRARCH_SEL 1 #define MIO_BN_TEST_EXPAVGFACTOR 0.1 -#define MIO_BN_TEST_EPSILON 1e-6 +#define MIO_BN_TEST_EPSILON 1e-5 // FLT_EPSILON #define MIO_BN_SP_TEST_DEBUG 0 //**************************************************** @@ -717,6 +717,8 @@ struct verify_backward_bn_spatial_recalc dyelem = dy_input(bidx,cidx,row,column); dshift_accum_arr[row] += dyelem; dscale_accum_arr[row] += xhat[xhat_index]*dyelem; + //dscale_accum_arr[row] += x_input(bidx,cidx,row,column);;//dscale_accum_arr[row] += xhat[xhat_index]; + //dscale_accum_arr[row] += 1.0;//DEBUG } }// for (column) }// for (row) @@ -725,7 +727,6 @@ struct verify_backward_bn_spatial_recalc dscale(0,cidx,0,0) += dscale_accum_arr[i]; } #endif - dscale(0, cidx, 0, 0) /= nhw; for(int row = 0; row < height; row++) { // via rows @@ -925,9 +926,11 @@ struct verify_backward_bn_spatial_use_saved //per (x-dims) channel load a block of data into LDS elemStd = x_input(bidx,cidx,row,column) - mean;// (x_i - mean) xhat[xhat_index] = elemStd*invVar; + //printf("xhat[%d]: %lf\n",xhat_index,xhat[xhat_index]); dyelem = dy_input(bidx,cidx,row,column); dshift_accum_arr[row] += dyelem; dscale_accum_arr[row] += xhat[xhat_index]*dyelem; + //dscale_accum_arr[row] += 1.0;//DEBUG } }// for (column) }// for (row) @@ -936,7 +939,6 @@ struct verify_backward_bn_spatial_use_saved dscale(0,cidx,0,0) += dscale_accum_arr[i]; } #endif - dscale(0, cidx, 0, 0) /= nhw; for(int row = 0; row < height; row++) { // via rows @@ -1066,6 +1068,7 @@ struct batch_norm_spatial_driver : test_driver batch_norm_spatial_driver() { this->batch_factor = 4; + // this->verbose=true; add(input, "input", get_bn_spatial_input_tensor()); } @@ -1101,7 +1104,7 @@ struct batch_norm_spatial_driver : test_driver #if(MIO_BN_SP_TEST_DEBUG == 1) std::cout << "Running forward inference spatial recalc." << std::endl; #endif - + // this->tolerance = 80; // Debug values // std::fill(input.begin(), input.end(), 1); // std::fill(scale.begin(), scale.end(), 1); @@ -1118,7 +1121,19 @@ struct batch_norm_spatial_driver : test_driver // backprop recalc auto dy_input = std::get<0>(outpair.second); - + for(int bidx = 0; bidx < n; bidx++) + { // via mini_batch + for(int cidx = 0; cidx < c; cidx++) + { // via mini_batch + for(int row = 0; row < h; row++) + { // via rows + for(int column = 0; column < w; column++) + { + dy_input(bidx, cidx, row, column) *= 0.1; + } + } + } + } #if(MIO_BN_SP_TEST_DEBUG == 2) auto debugvals = verify(verify_backward_bn_spatial_recalc{input, dy_input, scale}); auto gpuout = std::get<0>(debugvals.second); @@ -1148,16 +1163,14 @@ struct batch_norm_spatial_driver : test_driver mh = row; mw = column; } - if(diff > 1.) - { - std::cout << "gpu[" << bidx << ", " << cidx << ", " << row << ", " - << column << "]: " << gpuout(bidx, cidx, row, column) - << " :: "; - std::cout << "cpu[" << bidx << ", " << cidx << ", " << row << ", " - << column << "]: " << cpuout(bidx, cidx, row, column) - << " :: "; - std::cout << "diff: " << diff << std::endl; - } + // if(diff > 1.) + // { + std::cout << "gpu[" << bidx << ", " << cidx << ", " << row << ", " << column + << "]: " << gpuout(bidx, cidx, row, column) << " :: "; + std::cout << "cpu[" << bidx << ", " << cidx << ", " << row << ", " << column + << "]: " << cpuout(bidx, cidx, row, column) << " :: "; + std::cout << "diff: " << diff << std::endl; + // } } } } @@ -1174,17 +1187,72 @@ struct batch_norm_spatial_driver : test_driver #if(MIO_BN_SP_TEST_DEBUG == 1) std::cout << "Running back propagation spatial recalc." << std::endl; #endif + this->tolerance = 80 * input.desc.GetElementSize(); verify(verify_backward_bn_spatial_recalc{input, dy_input, scale}); #endif // backprop use saved values auto savedMean = std::get<3>(outpair.second); auto savedInvVar = std::get<4>(outpair.second); + +#if(MIO_BN_SP_TEST_DEBUG == 3) + + auto debugvals = verify(verify_backward_bn_spatial_use_saved{ + input, dy_input, scale, savedMean, savedInvVar}); + auto gpuout = std::get<0>(debugvals.second); + auto cpuout = std::get<0>(debugvals.first); + + double maxdiff = 0.; + int mn = 0; + int mc = 0; + int mh = 0; + int mw = 0; + + for(int bidx = 0; bidx < n; bidx++) + { // via mini_batch + for(int cidx = 0; cidx < c; cidx++) + { // via mini_batch + for(int row = 0; row < h; row++) + { // via rows + for(int column = 0; column < w; column++) + { // via columns + double diff = + fabs(gpuout(bidx, cidx, row, column) - cpuout(bidx, cidx, row, column)); + if(diff > maxdiff) + { + maxdiff = diff; + mn = bidx; + mc = cidx; + mh = row; + mw = column; + } + // if(diff > 1.) + //{ + std::cout << "gpu[" << bidx << ", " << cidx << ", " << row << ", " << column + << "]: " << gpuout(bidx, cidx, row, column) << " :: "; + std::cout << "cpu[" << bidx << ", " << cidx << ", " << row << ", " << column + << "]: " << cpuout(bidx, cidx, row, column) << " :: "; + std::cout << "diff: " << diff << std::endl; + //} + } + } + } + } + if(maxdiff > 0) + { + std::cout << "Max diff: " << maxdiff << std::endl; + std::cout << "gpu[" << mn << ", " << mc << ", " << mh << ", " << mw + << "]: " << gpuout(mn, mc, mh, mw) << " :: "; + std::cout << "cpu[" << mn << ", " << mc << ", " << mh << ", " << mw + << "]: " << cpuout(mn, mc, mh, mw) << std::endl; + } +#else #if(MIO_BN_SP_TEST_DEBUG == 1) std::cout << "Running back propagation spatial with S set." << std::endl; #endif verify(verify_backward_bn_spatial_use_saved{ input, dy_input, scale, savedMean, savedInvVar}); +#endif } };