Merge branch '1.0.x' of github.com:AMDComputeLibraries/MLOpen into 1.0.x
pfultz2 committed Jul 26, 2017
2 parents b297fc7 + f1ed0c8 commit 3ad1114
Showing 4 changed files with 105 additions and 60 deletions.
82 changes: 47 additions & 35 deletions driver/conv_driver.hpp
@@ -92,6 +92,9 @@ class ConvDriver : public Driver
miopenCreateTensorDescriptor(&biasTensor);

miopenCreateConvolutionDescriptor(&convDesc);

workspace_bwd_dev = nullptr;
workspace_fwd_dev = nullptr;
}

int AddCmdLineArgs();
@@ -373,24 +376,32 @@ int ConvDriver<T>::AllocateBuffersAndCopy()
dwei_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, wei_sz, sizeof(float)));
dout_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(float)));
out_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(float)));
workspace_bwd_dev =
std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSize_bwd / sizeof(T), sizeof(T)));
workspace_fwd_dev =
std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSize_fwd / sizeof(T), sizeof(T)));

in = std::vector<T>(in_sz);
din = std::vector<T>(in_sz);
wei = std::vector<T>(wei_sz);
dwei = std::vector<T>(wei_sz, 0);
dout = std::vector<T>(out_sz, 0);
out = std::vector<T>(out_sz, 0);
workspace_bwd = std::vector<T>(workSpaceSize_bwd / sizeof(T), 0);
workspace_fwd = std::vector<T>(workSpaceSize_fwd / sizeof(T), 0);
outhost = std::vector<T>(out_sz, 0);
workspace_bwd_host = std::vector<T>(workSpaceSize_bwd / sizeof(T), 0);
workspace_fwd_host = std::vector<T>(workSpaceSize_fwd / sizeof(T), 0);
dwei_host = std::vector<T>(wei_sz, 0);
din_host = std::vector<T>(in_sz, 0);
if(workSpaceSize_bwd != 0)
{
workspace_bwd_dev =
std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSize_bwd / sizeof(T), sizeof(T)));
workspace_bwd = std::vector<T>(workSpaceSize_bwd / sizeof(T), 0);
workspace_bwd_host = std::vector<T>(workSpaceSize_bwd / sizeof(T), 0);
}
if(workSpaceSize_fwd != 0)
{
workspace_fwd_dev =
std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSize_fwd / sizeof(T), sizeof(T)));
workspace_fwd = std::vector<T>(workSpaceSize_fwd / sizeof(T), 0);
workspace_fwd_host = std::vector<T>(workSpaceSize_fwd / sizeof(T), 0);
}

in = std::vector<T>(in_sz);
din = std::vector<T>(in_sz);
wei = std::vector<T>(wei_sz);
dwei = std::vector<T>(wei_sz, 0);
dout = std::vector<T>(out_sz, 0);
out = std::vector<T>(out_sz, 0);

outhost = std::vector<T>(out_sz, 0);

dwei_host = std::vector<T>(wei_sz, 0);
din_host = std::vector<T>(in_sz, 0);

std::string inFileName = inflags.GetValueStr("in_data");
std::string weiFileName = inflags.GetValueStr("weights");
@@ -487,21 +498,21 @@ int ConvDriver<T>::FindForward(int& ret_algo_count,
std::vector<miopenConvAlgoPerf_t>& perf_results)
{

return miopenFindConvolutionForwardAlgorithm(GetHandle(),
inputTensor,
in_dev->GetMem(),
weightTensor,
wei_dev->GetMem(),
convDesc,
outputTensor,
out_dev->GetMem(),
request_algo_count,
&ret_algo_count,
perf_results.data(),
workspace_fwd_dev->GetMem(),
workspace_fwd_dev->GetSize(),
(inflags.GetValueInt("search") == 1) ? true
: false);
return miopenFindConvolutionForwardAlgorithm(
GetHandle(),
inputTensor,
in_dev->GetMem(),
weightTensor,
wei_dev->GetMem(),
convDesc,
outputTensor,
out_dev->GetMem(),
request_algo_count,
&ret_algo_count,
perf_results.data(),
(workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetMem() : nullptr,
(workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetSize() : 0,
(inflags.GetValueInt("search") == 1) ? true : false);
}

template <typename T>
@@ -536,8 +547,9 @@ int ConvDriver<T>::RunForwardGPU()
&beta,
outputTensor,
out_dev->GetMem(),
workspace_fwd_dev->GetMem(),
workspace_fwd_dev->GetSize());
(workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetMem()
: nullptr,
(workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetSize() : 0);
}

if(inflags.GetValueInt("time") == 1)
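The conv_driver.hpp changes above allocate the forward and backward workspaces only when the queried workspace size is non-zero, and pass nullptr with a size of 0 to the convolution calls otherwise, presumably to avoid creating zero-byte GPUMem objects when an algorithm needs no scratch space. A minimal sketch of that guard pattern, using a hypothetical GpuBuffer stand-in rather than the driver's real GPUMem class:

#include <cstddef>
#include <memory>
#include <vector>

// Hypothetical stand-in for the driver's GPUMem wrapper; only what is needed
// to illustrate the guarded-allocation pattern is shown here.
class GpuBuffer
{
    public:
    explicit GpuBuffer(std::size_t bytes) : bytes_(bytes), data_(bytes) {}
    void* Mem() { return data_.data(); } // device pointer in the real driver
    std::size_t Size() const { return bytes_; }

    private:
    std::size_t bytes_;
    std::vector<char> data_;
};

// Allocate a workspace only when the queried size is non-zero, mirroring the
// if(workSpaceSize_fwd != 0) / if(workSpaceSize_bwd != 0) guards in the diff.
std::unique_ptr<GpuBuffer> MakeWorkspace(std::size_t bytes)
{
    if(bytes != 0)
        return std::make_unique<GpuBuffer>(bytes);
    return nullptr;
}

// Forward the workspace as nullptr/0 when it was never allocated, as the
// updated convolution call sites do.
void RunConvolution(GpuBuffer* workspace)
{
    void* ws_ptr        = (workspace != nullptr) ? workspace->Mem() : nullptr;
    std::size_t ws_size = (workspace != nullptr) ? workspace->Size() : 0;
    (void)ws_ptr;  // passed to the convolution API in the real driver
    (void)ws_size;
}

int main()
{
    auto ws = MakeWorkspace(0); // zero-size query: no allocation made
    RunConvolution(ws.get());   // call site receives nullptr and size 0
    return 0;
}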
2 changes: 1 addition & 1 deletion driver/pool_driver.hpp
@@ -445,7 +445,7 @@ int PoolDriver<T>::VerifyForward()
do_backward,
maskhost.data(),
mask.data(),
(10e-6));
1);

printf(match ? "Forward Pooling Verifies on CPU and GPU\n"
: "Forward Pooling Verification Failed !!\n");
67 changes: 47 additions & 20 deletions src/kernels/MIOpenConv1x1.cl
@@ -40,11 +40,11 @@
// calculating the size of the area for weights prefetch

#if MLO_N_MAPS_PERGROUP > 1
#define MLO_WEIGHTS_PER_LOOP_MAX 8
#define MLO_WEIGHTS_PER_LOOP_MAX (8)
#else
#define MLO_WEIGHTS_PER_LOOP_MAX 16
#define MLO_WEIGHTS_PER_LOOP_MAX (16)
#endif
#if((MLO_N_MAPS_PERGROUP * MLO_N_LCL_IN_MAPS) < MLO_N_INPUTS)
#if(MLO_N_MAPS_PERGROUP * MLO_N_LCL_IN_MAPS < MLO_N_INPUTS)
#define MLO_LCL_IN_ROW (MLO_N_MAPS_PERGROUP * MLO_N_LCL_IN_MAPS)
#else
#define MLO_LCL_IN_ROW (MLO_N_INPUTS)
@@ -57,7 +57,11 @@
#else
#define MLO_WEIGHTS_PER_LOOP (MLO_WEIGHTS_PER_LOOP_MAX)
#endif

#define MLO_LCL_WEIGHTS_ROW (MLO_WEIGHTS_PER_LOOP * MLO_LCL_IN_ROW)

#define MLO_IN_LOOP ((MLO_N_INPUTS + MLO_LCL_WEIGHTS_ROW - 1) / MLO_LCL_WEIGHTS_ROW)

#define MLO_WEIGHTS_ROW (MLO_LCL_WEIGHTS_ROW * MLO_WEI_CHANNEL_STRIDE)

// size of the area for weights prefetch
@@ -112,14 +116,15 @@ After completion of the main MLO_IN_LOOP loop partial sums have been summed up i
*/

__kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
const __global _FLOAT* __restrict wei_ptr,
__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) __kernel void
MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
const __global _FLOAT* __restrict wei_ptr,
#if MLO_CONV_BIAS
const __global _FLOAT* __restrict bias,
const __global _FLOAT* __restrict bias,
#endif
__global _FLOAT* __restrict out_ptr,
UNUSED _FLOAT dummy_val // nothing
)
__global _FLOAT* __restrict out_ptr,
UNUSED _FLOAT dummy_val // nothing
)
{
// KERNEL
// private buffers
@@ -167,7 +172,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
}
}
// over all input maps; with step == MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP; MLO_IN_LOOP
for(uint wc = 0; wc < MLO_IN_LOOP; wc += MLO_WEIGHTS_PER_LOOP)
for(uint wc = 0; wc < MLO_IN_LOOP; ++wc)
{
// read array of weights
barrier(CLK_LOCAL_MEM_FENCE);
@@ -184,45 +189,46 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
uint lwi = (w & (MLO_WEIGHTS_ROW - 1));
#endif

uint wi = (wc * (MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP) + lwi) *
uint wi = (wc * MLO_LCL_WEIGHTS_ROW + lwi) *
#if MLO_DIR_FORWARD == 1
MLO_WEI_CHANNEL_STRIDE;
#else
MLO_WEI_BSTRIDE;
#endif

// out of range check
uint wei_off_r = wei_off + wi +
oi *
uint wei_off1 = wei_off + wi +
oi *
#if MLO_DIR_FORWARD == 1
MLO_WEI_BSTRIDE;
MLO_WEI_BSTRIDE;
#else
MLO_WEI_CHANNEL_STRIDE;
MLO_WEI_CHANNEL_STRIDE;
#endif

wei_off_r = (wei_off_r < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_off_r : 0;
uint wei_off_r = (wei_off1 < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_off1 : 0;
_FLOAT wei_val = wei_ptr[wei_off_r];
wei_val = (wei_off_r < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_val : 0;
wei_val = (wei_off1 < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_val : 0;
lcl_wei_stage[w] = wei_val;
}

barrier(CLK_LOCAL_MEM_FENCE);

uint c = 0;
#if MLO_WEIGHTS_PER_LOOP > 7
#pragma unroll(MLO_WEIGHTS_PER_LOOP / 8)
#endif
for(uint ci = 0; ci < MLO_WEIGHTS_PER_LOOP;
++ci, in_off += MLO_IN_CHANNEL_STRIDE * MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP)
{
uint c = wc + ci;
c = wc * MLO_WEIGHTS_PER_LOOP + ci;
uint wei_indx = ci;

// read data
// over all local batchs
uint in_off1 = in_off;
uint in_off2 = 0;
for(uint ib = 0; ib < MLO_N_LCL_BATCHS; ++ib, in_off1 += MLO_IN_BATCH_STRIDE)
{
uint in_off2 = in_off1;
in_off2 = in_off1;
// lcl in maps (in data tiles) has the stride = MLO_N_MAPS_PERGROUP
for(uint ilc = 0; ilc < MLO_N_LCL_IN_MAPS;
++ilc, in_off2 += MLO_IN_CHANNEL_STRIDE * MLO_N_MAPS_PERGROUP)
@@ -234,6 +240,9 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
c * MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP + in_map_id +
ilc * MLO_N_MAPS_PERGROUP <
MLO_N_INPUTS;
#ifndef __AMDGCN__
in_off2 = (v) ? in_off2 : 0;
#endif
__global const _FLOAT* in_p = &in_ptr[in_off2];
#if MLO_C1x1_PIXLEFT > 0
// if the last one
@@ -269,6 +278,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
in_stage[ib][ilc][i] = v ? val : 0.0f;
#endif
#if DBG_OUT_OF_RNGE

if(in_off2 + i >= MLO_IN_BATCH_STRIDE * MLO_BATCH_SZ)
{
printf("k:err:in-of-range\n");
@@ -301,6 +311,23 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
for(uint i = 0; i < MLO_READ_UNIT; ++i)
{
out_tiles[ib][olc][i] += in_stage[ib][ilc][i] * wei_stage;

#if 0 // MLO_DIR_FORWARD == 0
if ( in_stage[ib][ilc][i] * wei_stage!= 0 && out_grp_block * MLO_N_LCL_OUT_MAPS + olc == 0 && i == 0 && get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0)
{
printf("K:c: %d %d %d %d %f %f %f %f\n",
wc,
MLO_IN_LOOP,
MLO_WEIGHTS_PER_LOOP,
MLO_WEIGHTS_ROW,

out_tiles[ib][olc][i],
in_stage[ib][ilc][i] * wei_stage,
in_stage[ib][ilc][i],
wei_stage
);
}
#endif
}
}
}
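In MIOpenConv1x1.cl the kernel now carries a reqd_work_group_size attribute and computes MLO_IN_LOOP itself as a ceiling division over MLO_LCL_WEIGHTS_ROW, while the outer loop steps wc by one and rebuilds the absolute chunk index as c = wc * MLO_WEIGHTS_PER_LOOP + ci. A small host-side sketch of that index arithmetic; the sizes below are made-up examples, not values from any real configuration:

#include <cassert>
#include <cstdio>

// Ceiling division, matching the new MLO_IN_LOOP macro in MIOpenConv1x1.cl:
// ((MLO_N_INPUTS + MLO_LCL_WEIGHTS_ROW - 1) / MLO_LCL_WEIGHTS_ROW)
constexpr unsigned CeilDiv(unsigned n, unsigned d) { return (n + d - 1) / d; }

int main()
{
    const unsigned n_inputs         = 96; // stand-in for MLO_N_INPUTS
    const unsigned lcl_weights_row  = 16; // stand-in for MLO_LCL_WEIGHTS_ROW
    const unsigned weights_per_loop = 8;  // stand-in for MLO_WEIGHTS_PER_LOOP

    const unsigned in_loop = CeilDiv(n_inputs, lcl_weights_row); // MLO_IN_LOOP

    // New indexing: wc advances by 1 and the absolute chunk index is
    // c = wc * weights_per_loop + ci, so chunks are visited contiguously.
    unsigned expected = 0;
    for(unsigned wc = 0; wc < in_loop; ++wc)
    {
        for(unsigned ci = 0; ci < weights_per_loop; ++ci)
        {
            const unsigned c = wc * weights_per_loop + ci;
            assert(c == expected);
            ++expected;
        }
    }
    std::printf("in_loop = %u, chunks visited = %u\n", in_loop, expected);
    return 0;
}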
14 changes: 10 additions & 4 deletions src/mlo_dir_conv.cpp
@@ -239,12 +239,14 @@ int mlo_construct_winograd::mloConstruct()
{
return (mloConstructBinaryWinograd3x3U(rmv));
}
#if MIOPEN_BACKEND_OPENCL
if(mloIsCorrectBinaryWinogradRxSFwd(rmv) &&
!miopen::IsDisabled(MIOPEN_DEBUG_AMD_WINOGRAD_RXS{}) &&
(no_perf_filtering || mloIsFastBinaryWinogradRxSFwd()))
{
return (mloConstructBinaryWinogradRxSFwd());
}
#endif
}
}
#endif
@@ -1395,7 +1397,7 @@ int mlo_construct_direct2D::mloConstructDirect2D1x1()
// n of input map per group
N_MAPS_PERGROUP = std::min(N_MAPS_PERGROUP, n_input_scaled);
// number of input loops
int n_in_loop = (n_input_scaled + N_MAPS_PERGROUP - 1) / N_MAPS_PERGROUP;
// int n_in_loop = (n_input_scaled + N_MAPS_PERGROUP - 1) / N_MAPS_PERGROUP;

// number of batches inside wk_item
_n_stacks = std::min(_batch_sz, _n_stacks);
@@ -1434,10 +1436,13 @@ int mlo_construct_direct2D::mloConstructDirect2D1x1()
std::to_string(static_cast<long long>(wei_cstride))
// algorithm parameters
+ std::string(" -DMLO_GRP_SZ0=") + std::to_string(static_cast<long long>(GRP_SZ)) +
std::string(" -DMLO_GRP_SZ1=") + std::to_string(1) + std::string(" -DMLO_GRP_SZ2=") +
std::to_string(1) +

std::string(" -DMLO_MAP_SZ4=") + std::to_string(static_cast<long long>(MAP_SZ4)) +
std::string(" -DMLO_C1x1_PIXLEFT=") + std::to_string(static_cast<long long>(C1x1_PIXLEFT)) +
std::string(" -DMLO_DIVBY4=") + std::to_string(static_cast<long long>(DIVBY4)) +
std::string(" -DMLO_IN_LOOP=") + std::to_string(static_cast<long long>(n_in_loop)) +
// std::string(" -DMLO_IN_LOOP=") + std::to_string(static_cast<long long>(n_in_loop)) +
std::string(" -DMLO_N_LCL_BATCHS=") +
std::to_string(static_cast<long long>(_n_stacks)) // # of diff stacks (part of batch).
+ std::string(" -DMLO_N_LCL_OUT_MAPS=") +
@@ -3380,8 +3385,9 @@ bool mlo_construct_BwdWrW2D::mloIsFastAsmDirect3x3WrW() const
// They work fine on gfx8
// /todo fix memory faults on gfx9
const std::string name = _stream->GetDeviceName();
return !(name == "gfx900" && (_in_width == 13 || _in_width == 27 || _in_width == 54 ||
_in_width == 57 || _in_width == 17 || _in_width == 250));
return !(name == "gfx900" &&
(_in_width == 13 || _in_width == 27 || _in_width == 54 || _in_width == 57 ||
_in_width == 17 || _in_width == 250 || _in_width == 175));
}

int mlo_construct_BwdWrW2D::mloConstructAsmDirect3x3WrW()
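The mlo_dir_conv.cpp changes complement the kernel edits: the generated compile options now define MLO_GRP_SZ1=1 and MLO_GRP_SZ2=1 (consumed by the new reqd_work_group_size attribute) and no longer pass -DMLO_IN_LOOP, which the kernel derives on its own. A minimal sketch of assembling such a -D option string; the helper and the group-size value are illustrative only, not MIOpen code:

#include <iostream>
#include <string>

// Illustrative helper for appending one -D macro definition to an OpenCL
// build-options string, in the spirit of the comp_options assembly above.
static std::string Define(const std::string& name, long long value)
{
    return " -D" + name + "=" + std::to_string(value);
}

int main()
{
    const long long grp_sz = 64; // hypothetical MLO_GRP_SZ value
    std::string comp_options;
    comp_options += Define("MLO_GRP_SZ0", grp_sz);
    comp_options += Define("MLO_GRP_SZ1", 1); // added by this commit
    comp_options += Define("MLO_GRP_SZ2", 1); // added by this commit
    // Note: no -DMLO_IN_LOOP here; MIOpenConv1x1.cl now computes it itself.

    std::cout << comp_options << '\n';
    return 0;
}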
