diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp
index 08bc5bb693..98c287ac78 100644
--- a/driver/bn_driver.hpp
+++ b/driver/bn_driver.hpp
@@ -45,7 +45,7 @@
 #define MIO_BN_DEBUG 1
 #define MIO_BN_MAX_DEBUGLOOP 65536
 
-#define EPSILON 1e-6
+#define EPSILON 1e-4
 
 #define ERRTOL 1e-6
 #define RMSTOL 1e-6
@@ -1286,8 +1286,8 @@ int BatchNormDriver<T>::VerifyBackward()
     if(!back)
         return miopenStatusSuccess;
 
-    const double tolerance = ERRTOL;
-    const double maxrms    = RMSTOL;
+    const double tolerance = ERRTOL * 1000;
+    const double maxrms    = RMSTOL * 1000;
     double diff            = 0.;
     bool anError           = false;
 
diff --git a/driver/miopen_BatchNormHost.hpp b/driver/miopen_BatchNormHost.hpp
index 14ff4b2263..a99f6a014c 100644
--- a/driver/miopen_BatchNormHost.hpp
+++ b/driver/miopen_BatchNormHost.hpp
@@ -30,7 +30,7 @@
 #include <cmath>
 #include <iomanip>
 
-#define MIO_HEIRARCH_SEL 0
+#define MIO_HEIRARCH_SEL 1
 
 #if(MIO_HEIRARCH_SEL == 1)
 #define MIO_BN_DIST 32
@@ -38,8 +38,9 @@
 
 template <typename T>
 int miopenBNFwdTrainPerActivationRunHost(
-    /*	T alpha,
-            T beta,
+    /*
+        T alpha,
+        T beta,
     */
     int n_batchs,
     int channels,
@@ -743,7 +744,6 @@ int miopenBNBwdPerActivationRunHost(
                         dxhat += tmp1;
                         dxhathat += tmp1 * xhat[xhat_index];
                     } // end for(n_batchs)
-                    dscale_ptr[adjIndex] /= double(n_batchs);
 
                     for(int bidx = 0; bidx < n_batchs; bidx++)
                     { // via mini_batch
@@ -812,7 +812,6 @@ int miopenBNBwdPerActivationRunHost(
                         dxhat += tmp1;
                         dxhathat += tmp1 * xhat[xhat_index];
                     } // end for(n_batchs)
-                    dscale_ptr[adjIndex] /= double(n_batchs);
 
                     for(int bidx = 0; bidx < n_batchs; bidx++)
                     { // via mini_batch
@@ -891,7 +890,7 @@ int miopenBNBwdSpatialRunHost(
                     } // end for(n_batchs)
                 }     // for (column)
             }         // for (row)
-            dscale_ptr[cidx] /= NHW;
+
             // process the batch per channel
             for(int row = 0; row < height; row++)
             { // via rows
@@ -1087,17 +1086,15 @@ int miopenBNBwdSpatialRunHost(
             }
 #endif
 
-            dscale_ptr[cidx] /= NHW;
 // printf("dscale: %f\n",dscale_ptr[cidx]);
 // printf("dbias: %f\n",dbias_ptr[cidx]);
-// printf("HELLO BASTARDS!!!");
+
 #if(MIO_HEIRARCH_SEL == 0)
             for(int row = 0; row < height; row++)
             { // via rows
                 for(int column = 0; column < width; column++)
                 { // via columns
                     adjIndex = Csubindex + width * row + column;
-
                     for(int bidx = 0; bidx < n_batchs; bidx++)
                     { // via mini_batch
                         index         = in_nstride * bidx + adjIndex;
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index a7b4462dc2..68af8a3f39 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -376,8 +376,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGet4dTensorDescriptor(miopenTensorDescriptor_
  * @param tensorDesc   Tensor descriptor type (input)
  * @param dataType     Currently only miopenFloat is implemented (input)
  * @param nbDims       Number of dimensions in the dimsA array (input)
- * @param dimsA        Array containing the size of dimensions (output)
- * @param stridesA     Array containing the size of stride (output)
+ * @param dimsA        Array containing the size of dimensions (input)
+ * @param stridesA     Array containing the size of stride (input)
  * @return             miopenStatus_t
 */
 MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t tensorDesc,
diff --git a/src/batch_norm.cpp b/src/batch_norm.cpp
index 2d8585f154..0106925562 100644
--- a/src/batch_norm.cpp
+++ b/src/batch_norm.cpp
@@ -27,6 +27,8 @@
 #include <miopen/errors.hpp>
 #include <miopen/batch_norm.hpp>
 
+#define MIOPEN_BN_SYNCH 0
+
 namespace miopen {
 
 void DeriveBNTensorDescriptor(TensorDescriptor& derivedBnDesc,
@@ -73,7 +75,9 @@ inline void profileSequence(Handle& handle, unsigned char select)
         }
         else
         {
+#if(MIOPEN_BN_SYNCH)
             handle.Finish();
+#endif
         }
         break;
     case 1:
@@ -89,7 +93,9 @@ inline void profileSequence(Handle& handle, unsigned char select)
         }
         else
         {
+#if(MIOPEN_BN_SYNCH)
             handle.Finish();
+#endif
         }
         break;
 
@@ -382,7 +388,7 @@ void bnBwdTrainSelectMulti(Handle& handle,
 
         kernel_subname = kernel_name + "FinalDScale";
         handle.GetKernel(algo_name, network_config, program_name, kernel_subname, vld, vgd, parms)(
-            dx, dScale, inhw);
+            dx, dScale);
         profileSequence(handle, 1);
 
         kernel_subname = kernel_name + "DX";
@@ -433,7 +439,7 @@ void bnBwdTrainSelectMulti(Handle& handle,
 
         kernel_subname = kernel_name + "FinalDScale";
         handle.GetKernel(algo_name, network_config, program_name, kernel_subname, vld, vgd, parms)(
-            dx, dScale, inhw);
+            dx, dScale);
         profileSequence(handle, 1);
 
         kernel_subname = kernel_name + "DX";
diff --git a/src/kernels/MIOpenBatchNormBwdPerAct.cl b/src/kernels/MIOpenBatchNormBwdPerAct.cl
index 68ce1a24a4..7568087fb3 100644
--- a/src/kernels/MIOpenBatchNormBwdPerAct.cl
+++ b/src/kernels/MIOpenBatchNormBwdPerAct.cl
@@ -146,7 +146,6 @@ __kernel void BatchNormBwdPerActivationSaved(const __global _FLOAT* x_in,
                 dxhat += tmp1;
                 dxhathat = mad(tmp1, xhat, dxhathat);
             } // end for(n)
-            pvt_dscale /= (_FLOAT)N;
 
             for(int n = 0; n < N; n++)
             {
@@ -246,7 +245,6 @@ __kernel void BatchNormBwdPerActivation(const __global _FLOAT* x_in,
                 dxhat += tmp1;
                 dxhathat = mad(tmp1, xhat, dxhathat);
             } // end for(n)
-            pvt_dscale /= (_FLOAT)N;
 
             for(int n = 0; n < N; n++)
             {
diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl
index 962b3d81be..f0559c76b3 100644
--- a/src/kernels/MIOpenBatchNormBwdSpatial.cl
+++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl
@@ -101,9 +101,11 @@
 #undef __AMDGCN__
 #endif
 
-//#ifdef __AMDGCN__
-//#undef __AMDGCN__
-//#endif
+/*
+#ifdef __AMDGCN__
+#undef __AMDGCN__
+#endif
+*/
 
 // Disable specific warnings
 #ifdef __clang__
@@ -409,7 +411,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
             lcl_data[lid] += lcl_data[lid + red];
         barrier(CLK_LOCAL_MEM_FENCE);
     }
-    dppLDSReduce64(&ds, lcl_data, lid, INHW);
+    dppLDSReduce64(&ds, lcl_data, lid, 1);
 #else
     for(unsigned int red = (MIO_BN_GRP0 >> 1); red > 256; red >>= 1)
     {
@@ -417,7 +419,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
             lcl_data[lid] += lcl_data[lid + red];
         barrier(CLK_LOCAL_MEM_FENCE);
     }
-    regLDSreduce(&ds, lcl_data, lid, INHW);
+    regLDSreduce(&ds, lcl_data, lid, 1);
 #endif
 
     if(lid < MIO_BN_SEGMENT)
@@ -649,16 +651,14 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
 #ifdef __AMDGCN__
 
 #if(MIO_BN_HW > 16)
-    dppRegReduce64(&ds, INHW);
+    dppRegReduce64(&ds, 1);
 #elif(MIO_BN_HW > 1)
-    dppRegReduce16(&ds, INHW);
-#else
-    ds *= INHW;
+    dppRegReduce16(&ds, 1);
 #endif // HW
 #else  // if not GCN
 
 #if(MIO_BN_HW > 16)
-    regLDSreduce(&ds, lcl_data, ylid, INHW);
+    regLDSreduce(&ds, lcl_data, ylid, 1);
 #elif(MIO_BN_HW > 1)
     lcl_data[ylid] = ds;
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -668,9 +668,6 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
     {
         ds += lcl_data[i];
     }
-    ds *= INHW;
-#else
-    ds *= INHW;
 #endif // HW
 #endif // GCN
     //===========================================
@@ -686,7 +683,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
         {
             index         = n * MIO_BN_CHW + cidx + ylid;
             tmp1          = mad(NHW, dyvalues[n], -db);
-            tmp2          = -batchvalues[n] * ds;
+            tmp2          = -(batchvalues[n]) * ds;
             tmp3          = (pscale * invVar) * INHW;
             dx_out[index] = tmp3 * (tmp2 + tmp1);
         }
@@ -901,7 +898,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
 #else // GCN
 
 #if(MIO_BN_N > 16)
-    regLDSreduce(&db, lcl_data, ylid, INHW);
+    regLDSreduce(&db, lcl_data, ylid, 1);
 #elif(MIO_BN_N > 1)
     lcl_data[ylid] = db;
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -917,7 +914,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
 #ifdef __AMDGCN__
 
 #if(MIO_BN_N > 16)
-    dppRegReduce64(&ds, INHW);
+    dppRegReduce64(&ds, 1);
 #elif(MIO_BN_N > 1)
     lcl_data[ylid] = ds;
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -927,14 +924,11 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
     {
         ds += lcl_data[i];
     }
-    ds *= INHW;
-#else
-    ds *= INHW;
 #endif // N
 #else  // if not GCN
 
 #if(MIO_BN_N > 16)
-    regLDSreduce(&ds, lcl_data, ylid, INHW);
+    regLDSreduce(&ds, lcl_data, ylid, 1);
 #elif(MIO_BN_N > 1)
     lcl_data[ylid] = ds;
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -944,9 +938,6 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
     {
         ds += lcl_data[i];
     }
-    ds *= INHW;
-#else
-    ds *= INHW;
 #endif // HW
 #endif // GCN
     //===========================================
@@ -962,7 +953,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
         {
             index         = ylid * MIO_BN_CHW + cidx + hw;
             tmp1          = mad(NHW, dyvalues[hw], -db);
-            tmp2          = -batchvalues[hw] * ds;
+            tmp2          = -(batchvalues[hw]) * ds;
             tmp3          = (pscale * invVar) * INHW;
             dx_out[index] = tmp3 * (tmp2 + tmp1);
         }
@@ -1231,7 +1222,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
     barrier(CLK_LOCAL_MEM_FENCE);
     lcl_data[ylid] = ds;
     barrier(CLK_LOCAL_MEM_FENCE);
-    dppLDSReduce64(&ds, lcl_data, ylid, INHW);
+    dppLDSReduce64(&ds, lcl_data, ylid, 1);
 
 #else
     for(unsigned int red = (MIO_BN_GRP1 >> 1); red > 256; red >>= 1)
@@ -1240,7 +1231,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
             lcl_data[ylid] += lcl_data[ylid + red];
         barrier(CLK_LOCAL_MEM_FENCE);
     }
-    regLDSreduce(&ds, lcl_data, ylid, INHW);
+    regLDSreduce(&ds, lcl_data, ylid, 1);
 #endif
     //===========================================
 
@@ -1256,7 +1247,7 @@ BatchNormBwdSpatial(const __global _FLOAT* __restrict x_in,
             index         = n * MIO_BN_CHW + cidx + ylid;
 #if(MIO_BN_N < MIO_BN_MAXN)
             tmp1          = mad(NHW, dyvalues[n], -db);
-            tmp2          = -batchvalues[n] * ds;
+            tmp2          = -(batchvalues[n]) * ds;
 #else
             tmp1 = mad(NHW, dy_in[index], -db);
             tmp2 = -(x_in[index] - mean) * invVar * ds;
@@ -1761,7 +1752,7 @@ BatchNormBwdSpatialDScale(const __global _FLOAT* x_in,
         unsigned int varstashindex  = cidx + ygrp_sz * ygrp_id + 3;
         lmean                       = buff[meanstashindex]; // load stashed mean
         livar                       = buff[varstashindex];
-#else  // SAVED
+#else  // NO SAVED
         lmean = savedMean[xgid];
         livar = savedInvVariance[xgid];
 #endif // SAVED
@@ -1781,6 +1772,7 @@ BatchNormBwdSpatialDScale(const __global _FLOAT* x_in,
             elemStd = x_in[index] - mean; // (x_i - mean)
             xhat    = elemStd * invVar;
             dscale  = mad(xhat, dy_in[index], dscale);
+            // dscale += 1.;
         } // end for
     }     // end if
 
@@ -1809,7 +1801,6 @@ BatchNormBwdSpatialDScale(const __global _FLOAT* x_in,
     regLDSreduce(&dscale, lcl_data, ylid, 1);
 
 #endif // GCN
-
     if(ylid == 0)
     {
         unsigned int gammaindex = cidx + ygrp_sz * ygrp_id + 4;
@@ -1818,7 +1809,7 @@ BatchNormBwdSpatialDScale(const __global _FLOAT* x_in,
 }
 
 __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void
-BatchNormBwdSpatialFinalDScale(__global _FLOAT* buff, __global _FLOAT* delta_scale, _FLOAT INHW)
+BatchNormBwdSpatialFinalDScale(__global _FLOAT* buff, __global _FLOAT* delta_scale)
 {
 
     __private _FLOAT ds  = 0.;
@@ -1852,7 +1843,7 @@ BatchNormBwdSpatialFinalDScale(__global _FLOAT* buff, __global _FLOAT* delta_sca
             lcl_data[ylid] += lcl_data[ylid + red];
         barrier(CLK_LOCAL_MEM_FENCE);
     }
-    dppLDSReduce64(&ds, lcl_data, ylid, INHW);
+    dppLDSReduce64(&ds, lcl_data, ylid, 1);
 #else  // GCN
     for(unsigned int red = (MIO_BN_GRP1 >> 1); red > 256; red >>= 1)
     {
@@ -1860,21 +1851,21 @@ BatchNormBwdSpatialFinalDScale(__global _FLOAT* buff, __global _FLOAT* delta_sca
             lcl_data[ylid] += lcl_data[ylid + red];
         barrier(CLK_LOCAL_MEM_FENCE);
     }
-    regLDSreduce(&ds, lcl_data, ylid, INHW);
+    regLDSreduce(&ds, lcl_data, ylid, 1);
 #endif // GCN
 
 #elif(MIO_BN_NGRPS <= 64)
 
 #ifdef __AMDGCN__
-    dppRegReduce64(&ds, INHW);
+    dppRegReduce64(&ds, 1);
 #else // GCN
     __local _FLOAT lcl_data[MIO_BN_LDS_SIZE];
-    regLDSreduce(&ds, lcl_data, ylid, INHW);
+    regLDSreduce(&ds, lcl_data, ylid, 1);
 #endif // GCN
 #else  // else < 16
 
 #ifdef __AMDGCN__
-    dppRegReduce16(&ds, INHW);
+    dppRegReduce16(&ds, 1);
 #else // GCN
     __local _FLOAT lcl_data[MIO_BN_LDS_SIZE];
     lcl_data[ylid] = ds;
@@ -1885,7 +1876,6 @@ BatchNormBwdSpatialFinalDScale(__global _FLOAT* buff, __global _FLOAT* delta_sca
     {
         ds += lcl_data[i];
     }
-    ds *= INHW;
 #endif // end AMDGCN
 #endif // NGRPS
 
diff --git a/test/bn_peract_test.cpp b/test/bn_peract_test.cpp
index cf1a112d96..76751bcb68 100644
--- a/test/bn_peract_test.cpp
+++ b/test/bn_peract_test.cpp
@@ -48,7 +48,7 @@
 // Run CPU emulations in hierarchical reduction mode.
 //#define MIO_HEIRARCH_SEL 0
 #define MIO_BN_TEST_EXPAVGFACTOR 0.1
-#define MIO_BN_TEST_EPSILON 1e-6
+#define MIO_BN_TEST_EPSILON 1e-5
 
 //****************************************************
 // FORWARD TRAIN
@@ -577,7 +577,6 @@ struct verify_backward_bn_per_activation_use_saved
                         dxhathat += tmp1 * xhat[xhat_index];
 
                     } // end for(n_batchs)
-                    dscale(0, cidx, row, column) /= n;
 
                     for(int bidx = 0; bidx < n_batch; bidx++)
                     { // via mini_batch
@@ -771,7 +770,6 @@ struct verify_backward_bn_per_activation_recalc
                         dxhathat += tmp1 * xhat[xhat_index];
 
                     } // end for(n_batchs)
-                    dscale(0, cidx, row, column) /= n;
 
                     for(int bidx = 0; bidx < n_batch; bidx++)
                     { // via mini_batch
diff --git a/test/bn_spatial_test.cpp b/test/bn_spatial_test.cpp
index 57c5349139..8c13486e4b 100644
--- a/test/bn_spatial_test.cpp
+++ b/test/bn_spatial_test.cpp
@@ -41,11 +41,11 @@
 #include <miopen/miopen.h>
 #include <miopen/tensor.hpp>
 #include <utility>
-
+#include <cfloat>
 // Run CPU emulations in hierarchical reduction mode.
 #define MIO_HEIRARCH_SEL 1
 #define MIO_BN_TEST_EXPAVGFACTOR 0.1
-#define MIO_BN_TEST_EPSILON 1e-6
+#define MIO_BN_TEST_EPSILON 1e-5 // FLT_EPSILON
 #define MIO_BN_SP_TEST_DEBUG 0
 
 //****************************************************
@@ -717,6 +717,8 @@ struct verify_backward_bn_spatial_recalc
                         dyelem              = dy_input(bidx,cidx,row,column);
                         dshift_accum_arr[row] += dyelem;
                         dscale_accum_arr[row] += xhat[xhat_index]*dyelem;
+                        //dscale_accum_arr[row] += x_input(bidx,cidx,row,column);;//dscale_accum_arr[row] += xhat[xhat_index];
+                        //dscale_accum_arr[row] += 1.0;//DEBUG
                     }	
                 }// for (column)
             }// for (row)  
@@ -725,7 +727,6 @@ struct verify_backward_bn_spatial_recalc
                 dscale(0,cidx,0,0) += dscale_accum_arr[i];    
             }
 #endif
-            dscale(0, cidx, 0, 0) /= nhw;
 
             for(int row = 0; row < height; row++)
             { // via rows
@@ -925,9 +926,11 @@ struct verify_backward_bn_spatial_use_saved
                         //per (x-dims) channel load a block of data into LDS
                         elemStd             = x_input(bidx,cidx,row,column) - mean;// (x_i - mean)
                         xhat[xhat_index]    = elemStd*invVar;
+                        //printf("xhat[%d]: %lf\n",xhat_index,xhat[xhat_index]);
                         dyelem              = dy_input(bidx,cidx,row,column);
                         dshift_accum_arr[row] += dyelem;
                         dscale_accum_arr[row] += xhat[xhat_index]*dyelem;
+                        //dscale_accum_arr[row] += 1.0;//DEBUG
                     }	
                 }// for (column)
             }// for (row)  
@@ -936,7 +939,6 @@ struct verify_backward_bn_spatial_use_saved
                 dscale(0,cidx,0,0) += dscale_accum_arr[i];    
             }
 #endif
-            dscale(0, cidx, 0, 0) /= nhw;
 
             for(int row = 0; row < height; row++)
             { // via rows
@@ -1066,6 +1068,7 @@ struct batch_norm_spatial_driver : test_driver
     batch_norm_spatial_driver()
     {
         this->batch_factor = 4;
+
         // this->verbose=true;
         add(input, "input", get_bn_spatial_input_tensor());
     }
@@ -1101,7 +1104,7 @@ struct batch_norm_spatial_driver : test_driver
 #if(MIO_BN_SP_TEST_DEBUG == 1)
         std::cout << "Running forward inference spatial recalc." << std::endl;
 #endif
-
+        // this->tolerance = 80;
         // Debug values
         // std::fill(input.begin(), input.end(), 1);
         // std::fill(scale.begin(), scale.end(), 1);
@@ -1118,7 +1121,19 @@ struct batch_norm_spatial_driver : test_driver
 
         // backprop recalc
         auto dy_input = std::get<0>(outpair.second);
-
+        for(int bidx = 0; bidx < n; bidx++)
+        { // via mini_batch
+            for(int cidx = 0; cidx < c; cidx++)
+            { // via mini_batch
+                for(int row = 0; row < h; row++)
+                { // via rows
+                    for(int column = 0; column < w; column++)
+                    {
+                        dy_input(bidx, cidx, row, column) *= 0.1;
+                    }
+                }
+            }
+        }
 #if(MIO_BN_SP_TEST_DEBUG == 2)
         auto debugvals = verify(verify_backward_bn_spatial_recalc<T>{input, dy_input, scale});
         auto gpuout    = std::get<0>(debugvals.second);
@@ -1148,16 +1163,14 @@ struct batch_norm_spatial_driver : test_driver
                             mh      = row;
                             mw      = column;
                         }
-                        if(diff > 1.)
-                        {
-                            std::cout << "gpu[" << bidx << ", " << cidx << ", " << row << ", "
-                                      << column << "]: " << gpuout(bidx, cidx, row, column)
-                                      << " :: ";
-                            std::cout << "cpu[" << bidx << ", " << cidx << ", " << row << ", "
-                                      << column << "]: " << cpuout(bidx, cidx, row, column)
-                                      << " :: ";
-                            std::cout << "diff: " << diff << std::endl;
-                        }
+                        // if(diff > 1.)
+                        // {
+                        std::cout << "gpu[" << bidx << ", " << cidx << ", " << row << ", " << column
+                                  << "]: " << gpuout(bidx, cidx, row, column) << " :: ";
+                        std::cout << "cpu[" << bidx << ", " << cidx << ", " << row << ", " << column
+                                  << "]: " << cpuout(bidx, cidx, row, column) << " :: ";
+                        std::cout << "diff: " << diff << std::endl;
+                        //    }
                     }
                 }
             }
@@ -1174,17 +1187,72 @@ struct batch_norm_spatial_driver : test_driver
 #if(MIO_BN_SP_TEST_DEBUG == 1)
         std::cout << "Running back propagation spatial recalc." << std::endl;
 #endif
+        this->tolerance = 80 * input.desc.GetElementSize();
         verify(verify_backward_bn_spatial_recalc<T>{input, dy_input, scale});
 #endif
 
         // backprop use saved values
         auto savedMean   = std::get<3>(outpair.second);
         auto savedInvVar = std::get<4>(outpair.second);
+
+#if(MIO_BN_SP_TEST_DEBUG == 3)
+
+        auto debugvals = verify(verify_backward_bn_spatial_use_saved<T>{
+            input, dy_input, scale, savedMean, savedInvVar});
+        auto gpuout = std::get<0>(debugvals.second);
+        auto cpuout = std::get<0>(debugvals.first);
+
+        double maxdiff = 0.;
+        int mn         = 0;
+        int mc         = 0;
+        int mh         = 0;
+        int mw         = 0;
+
+        for(int bidx = 0; bidx < n; bidx++)
+        { // via mini_batch
+            for(int cidx = 0; cidx < c; cidx++)
+            { // via mini_batch
+                for(int row = 0; row < h; row++)
+                { // via rows
+                    for(int column = 0; column < w; column++)
+                    { // via columns
+                        double diff =
+                            fabs(gpuout(bidx, cidx, row, column) - cpuout(bidx, cidx, row, column));
+                        if(diff > maxdiff)
+                        {
+                            maxdiff = diff;
+                            mn      = bidx;
+                            mc      = cidx;
+                            mh      = row;
+                            mw      = column;
+                        }
+                        // if(diff > 1.)
+                        //{
+                        std::cout << "gpu[" << bidx << ", " << cidx << ", " << row << ", " << column
+                                  << "]: " << gpuout(bidx, cidx, row, column) << " :: ";
+                        std::cout << "cpu[" << bidx << ", " << cidx << ", " << row << ", " << column
+                                  << "]: " << cpuout(bidx, cidx, row, column) << " :: ";
+                        std::cout << "diff: " << diff << std::endl;
+                        //}
+                    }
+                }
+            }
+        }
+        if(maxdiff > 0)
+        {
+            std::cout << "Max diff: " << maxdiff << std::endl;
+            std::cout << "gpu[" << mn << ", " << mc << ", " << mh << ", " << mw
+                      << "]: " << gpuout(mn, mc, mh, mw) << " :: ";
+            std::cout << "cpu[" << mn << ", " << mc << ", " << mh << ", " << mw
+                      << "]: " << cpuout(mn, mc, mh, mw) << std::endl;
+        }
+#else
 #if(MIO_BN_SP_TEST_DEBUG == 1)
         std::cout << "Running back propagation spatial with S set." << std::endl;
 #endif
         verify(verify_backward_bn_spatial_use_saved<T>{
             input, dy_input, scale, savedMean, savedInvVar});
+#endif
     }
 };