Merge branch '1.0.x' of github.com:AMDComputeLibraries/MLOpen into 1.0.x
pfultz2 committed Jul 26, 2017
2 parents b297fc7 + f1ed0c8 commit 3ad1114
Showing 4 changed files with 105 additions and 60 deletions.
82 changes: 47 additions & 35 deletions driver/conv_driver.hpp
@@ -92,6 +92,9 @@ class ConvDriver : public Driver
miopenCreateTensorDescriptor(&biasTensor);

miopenCreateConvolutionDescriptor(&convDesc);

workspace_bwd_dev = nullptr;
workspace_fwd_dev = nullptr;
}

int AddCmdLineArgs();
@@ -373,24 +376,32 @@ int ConvDriver<T>::AllocateBuffersAndCopy()
dwei_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, wei_sz, sizeof(float)));
dout_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(float)));
out_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(float)));
workspace_bwd_dev =
std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSize_bwd / sizeof(T), sizeof(T)));
workspace_fwd_dev =
std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSize_fwd / sizeof(T), sizeof(T)));

in = std::vector<T>(in_sz);
din = std::vector<T>(in_sz);
wei = std::vector<T>(wei_sz);
dwei = std::vector<T>(wei_sz, 0);
dout = std::vector<T>(out_sz, 0);
out = std::vector<T>(out_sz, 0);
workspace_bwd = std::vector<T>(workSpaceSize_bwd / sizeof(T), 0);
workspace_fwd = std::vector<T>(workSpaceSize_fwd / sizeof(T), 0);
outhost = std::vector<T>(out_sz, 0);
workspace_bwd_host = std::vector<T>(workSpaceSize_bwd / sizeof(T), 0);
workspace_fwd_host = std::vector<T>(workSpaceSize_fwd / sizeof(T), 0);
dwei_host = std::vector<T>(wei_sz, 0);
din_host = std::vector<T>(in_sz, 0);
if(workSpaceSize_bwd != 0)
{
workspace_bwd_dev =
std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSize_bwd / sizeof(T), sizeof(T)));
workspace_bwd = std::vector<T>(workSpaceSize_bwd / sizeof(T), 0);
workspace_bwd_host = std::vector<T>(workSpaceSize_bwd / sizeof(T), 0);
}
if(workSpaceSize_fwd != 0)
{
workspace_fwd_dev =
std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSize_fwd / sizeof(T), sizeof(T)));
workspace_fwd = std::vector<T>(workSpaceSize_fwd / sizeof(T), 0);
workspace_fwd_host = std::vector<T>(workSpaceSize_fwd / sizeof(T), 0);
}

in = std::vector<T>(in_sz);
din = std::vector<T>(in_sz);
wei = std::vector<T>(wei_sz);
dwei = std::vector<T>(wei_sz, 0);
dout = std::vector<T>(out_sz, 0);
out = std::vector<T>(out_sz, 0);

outhost = std::vector<T>(out_sz, 0);

dwei_host = std::vector<T>(wei_sz, 0);
din_host = std::vector<T>(in_sz, 0);

std::string inFileName = inflags.GetValueStr("in_data");
std::string weiFileName = inflags.GetValueStr("weights");
@@ -487,21 +498,21 @@ int ConvDriver<T>::FindForward(int& ret_algo_count,
std::vector<miopenConvAlgoPerf_t>& perf_results)
{

return miopenFindConvolutionForwardAlgorithm(GetHandle(),
inputTensor,
in_dev->GetMem(),
weightTensor,
wei_dev->GetMem(),
convDesc,
outputTensor,
out_dev->GetMem(),
request_algo_count,
&ret_algo_count,
perf_results.data(),
workspace_fwd_dev->GetMem(),
workspace_fwd_dev->GetSize(),
(inflags.GetValueInt("search") == 1) ? true
: false);
return miopenFindConvolutionForwardAlgorithm(
GetHandle(),
inputTensor,
in_dev->GetMem(),
weightTensor,
wei_dev->GetMem(),
convDesc,
outputTensor,
out_dev->GetMem(),
request_algo_count,
&ret_algo_count,
perf_results.data(),
(workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetMem() : nullptr,
(workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetSize() : 0,
(inflags.GetValueInt("search") == 1) ? true : false);
}

template <typename T>
@@ -536,8 +547,9 @@ int ConvDriver<T>::RunForwardGPU()
&beta,
outputTensor,
out_dev->GetMem(),
workspace_fwd_dev->GetMem(),
workspace_fwd_dev->GetSize());
(workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetMem()
: nullptr,
(workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetSize() : 0);
}

if(inflags.GetValueInt("time") == 1)
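The conv_driver.hpp changes above allocate the forward and backward workspaces only when the queried workspace size is non-zero, and pass nullptr with a size of 0 to the convolution calls otherwise, presumably to avoid creating zero-byte GPUMem objects when an algorithm needs no scratch space. A minimal sketch of that guard pattern, using a hypothetical GpuBuffer stand-in rather than the driver's real GPUMem class:

#include <cstddef>
#include <memory>
#include <vector>

// Hypothetical stand-in for the driver's GPUMem wrapper; only what is needed
// to illustrate the guarded-allocation pattern is shown here.
class GpuBuffer
{
    public:
    explicit GpuBuffer(std::size_t bytes) : bytes_(bytes), data_(bytes) {}
    void* Mem() { return data_.data(); } // device pointer in the real driver
    std::size_t Size() const { return bytes_; }

    private:
    std::size_t bytes_;
    std::vector<char> data_;
};

// Allocate a workspace only when the queried size is non-zero, mirroring the
// if(workSpaceSize_fwd != 0) / if(workSpaceSize_bwd != 0) guards in the diff.
std::unique_ptr<GpuBuffer> MakeWorkspace(std::size_t bytes)
{
    if(bytes != 0)
        return std::make_unique<GpuBuffer>(bytes);
    return nullptr;
}

// Forward the workspace as nullptr/0 when it was never allocated, as the
// updated convolution call sites do.
void RunConvolution(GpuBuffer* workspace)
{
    void* ws_ptr        = (workspace != nullptr) ? workspace->Mem() : nullptr;
    std::size_t ws_size = (workspace != nullptr) ? workspace->Size() : 0;
    (void)ws_ptr;  // passed to the convolution API in the real driver
    (void)ws_size;
}

int main()
{
    auto ws = MakeWorkspace(0); // zero-size query: no allocation made
    RunConvolution(ws.get());   // call site receives nullptr and size 0
    return 0;
}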
2 changes: 1 addition & 1 deletion driver/pool_driver.hpp
@@ -445,7 +445,7 @@ int PoolDriver<T>::VerifyForward()
do_backward,
maskhost.data(),
mask.data(),
(10e-6));
1);

printf(match ? "Forward Pooling Verifies on CPU and GPU\n"
: "Forward Pooling Verification Failed !!\n");
67 changes: 47 additions & 20 deletions src/kernels/MIOpenConv1x1.cl
@@ -40,11 +40,11 @@
// calculating the size of the area for weights prefetch

#if MLO_N_MAPS_PERGROUP > 1
#define MLO_WEIGHTS_PER_LOOP_MAX 8
#define MLO_WEIGHTS_PER_LOOP_MAX (8)
#else
#define MLO_WEIGHTS_PER_LOOP_MAX 16
#define MLO_WEIGHTS_PER_LOOP_MAX (16)
#endif
#if((MLO_N_MAPS_PERGROUP * MLO_N_LCL_IN_MAPS) < MLO_N_INPUTS)
#if(MLO_N_MAPS_PERGROUP * MLO_N_LCL_IN_MAPS < MLO_N_INPUTS)
#define MLO_LCL_IN_ROW (MLO_N_MAPS_PERGROUP * MLO_N_LCL_IN_MAPS)
#else
#define MLO_LCL_IN_ROW (MLO_N_INPUTS)
@@ -57,7 +57,11 @@
#else
#define MLO_WEIGHTS_PER_LOOP (MLO_WEIGHTS_PER_LOOP_MAX)
#endif

#define MLO_LCL_WEIGHTS_ROW (MLO_WEIGHTS_PER_LOOP * MLO_LCL_IN_ROW)

#define MLO_IN_LOOP ((MLO_N_INPUTS + MLO_LCL_WEIGHTS_ROW - 1) / MLO_LCL_WEIGHTS_ROW)

#define MLO_WEIGHTS_ROW (MLO_LCL_WEIGHTS_ROW * MLO_WEI_CHANNEL_STRIDE)

// size of the area for weights prefetch
@@ -112,14 +116,15 @@ After completion of the main MLO_IN_LOOP loop partial sums have been summed up i
*/

__kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
const __global _FLOAT* __restrict wei_ptr,
__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) __kernel void
MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
const __global _FLOAT* __restrict wei_ptr,
#if MLO_CONV_BIAS
const __global _FLOAT* __restrict bias,
const __global _FLOAT* __restrict bias,
#endif
__global _FLOAT* __restrict out_ptr,
UNUSED _FLOAT dummy_val // nothing
)
__global _FLOAT* __restrict out_ptr,
UNUSED _FLOAT dummy_val // nothing
)
{
// KERNEL
// private buffers
@@ -167,7 +172,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
}
}
// over all input maps; with step == MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP; MLO_IN_LOOP
for(uint wc = 0; wc < MLO_IN_LOOP; wc += MLO_WEIGHTS_PER_LOOP)
for(uint wc = 0; wc < MLO_IN_LOOP; ++wc)
{
// read array of weights
barrier(CLK_LOCAL_MEM_FENCE);
@@ -184,45 +189,46 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
uint lwi = (w & (MLO_WEIGHTS_ROW - 1));
#endif

uint wi = (wc * (MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP) + lwi) *
uint wi = (wc * MLO_LCL_WEIGHTS_ROW + lwi) *
#if MLO_DIR_FORWARD == 1
MLO_WEI_CHANNEL_STRIDE;
#else
MLO_WEI_BSTRIDE;
#endif

// out of range check
uint wei_off_r = wei_off + wi +
oi *
uint wei_off1 = wei_off + wi +
oi *
#if MLO_DIR_FORWARD == 1
MLO_WEI_BSTRIDE;
MLO_WEI_BSTRIDE;
#else
MLO_WEI_CHANNEL_STRIDE;
MLO_WEI_CHANNEL_STRIDE;
#endif

wei_off_r = (wei_off_r < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_off_r : 0;
uint wei_off_r = (wei_off1 < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_off1 : 0;
_FLOAT wei_val = wei_ptr[wei_off_r];
wei_val = (wei_off_r < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_val : 0;
wei_val = (wei_off1 < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_val : 0;
lcl_wei_stage[w] = wei_val;
}

barrier(CLK_LOCAL_MEM_FENCE);

uint c = 0;
#if MLO_WEIGHTS_PER_LOOP > 7
#pragma unroll(MLO_WEIGHTS_PER_LOOP / 8)
#endif
for(uint ci = 0; ci < MLO_WEIGHTS_PER_LOOP;
++ci, in_off += MLO_IN_CHANNEL_STRIDE * MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP)
{
uint c = wc + ci;
c = wc * MLO_WEIGHTS_PER_LOOP + ci;
uint wei_indx = ci;

// read data
// over all local batchs
uint in_off1 = in_off;
uint in_off2 = 0;
for(uint ib = 0; ib < MLO_N_LCL_BATCHS; ++ib, in_off1 += MLO_IN_BATCH_STRIDE)
{
uint in_off2 = in_off1;
in_off2 = in_off1;
// lcl in maps (in data tiles) has the stride = MLO_N_MAPS_PERGROUP
for(uint ilc = 0; ilc < MLO_N_LCL_IN_MAPS;
++ilc, in_off2 += MLO_IN_CHANNEL_STRIDE * MLO_N_MAPS_PERGROUP)
@@ -234,6 +240,9 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
c * MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP + in_map_id +
ilc * MLO_N_MAPS_PERGROUP <
MLO_N_INPUTS;
#ifndef __AMDGCN__
in_off2 = (v) ? in_off2 : 0;
#endif
__global const _FLOAT* in_p = &in_ptr[in_off2];
#if MLO_C1x1_PIXLEFT > 0
// if the last one
@@ -269,6 +278,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
in_stage[ib][ilc][i] = v ? val : 0.0f;
#endif
#if DBG_OUT_OF_RNGE

if(in_off2 + i >= MLO_IN_BATCH_STRIDE * MLO_BATCH_SZ)
{
printf("k:err:in-of-range\n");
@@ -301,6 +311,23 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr,
for(uint i = 0; i < MLO_READ_UNIT; ++i)
{
out_tiles[ib][olc][i] += in_stage[ib][ilc][i] * wei_stage;

#if 0 // MLO_DIR_FORWARD == 0
if ( in_stage[ib][ilc][i] * wei_stage!= 0 && out_grp_block * MLO_N_LCL_OUT_MAPS + olc == 0 && i == 0 && get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0)
{
printf("K:c: %d %d %d %d %f %f %f %f\n",
wc,
MLO_IN_LOOP,
MLO_WEIGHTS_PER_LOOP,
MLO_WEIGHTS_ROW,

out_tiles[ib][olc][i],
in_stage[ib][ilc][i] * wei_stage,
in_stage[ib][ilc][i],
wei_stage
);
}
#endif
}
}
}
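In MIOpenConv1x1.cl the kernel now carries a reqd_work_group_size attribute and computes MLO_IN_LOOP itself as a ceiling division over MLO_LCL_WEIGHTS_ROW, while the outer loop steps wc by one and rebuilds the absolute chunk index as c = wc * MLO_WEIGHTS_PER_LOOP + ci. A small host-side sketch of that index arithmetic; the sizes below are made-up examples, not values from any real configuration:

#include <cassert>
#include <cstdio>

// Ceiling division, matching the new MLO_IN_LOOP macro in MIOpenConv1x1.cl:
// ((MLO_N_INPUTS + MLO_LCL_WEIGHTS_ROW - 1) / MLO_LCL_WEIGHTS_ROW)
constexpr unsigned CeilDiv(unsigned n, unsigned d) { return (n + d - 1) / d; }

int main()
{
    const unsigned n_inputs         = 96; // stand-in for MLO_N_INPUTS
    const unsigned lcl_weights_row  = 16; // stand-in for MLO_LCL_WEIGHTS_ROW
    const unsigned weights_per_loop = 8;  // stand-in for MLO_WEIGHTS_PER_LOOP

    const unsigned in_loop = CeilDiv(n_inputs, lcl_weights_row); // MLO_IN_LOOP

    // New indexing: wc advances by 1 and the absolute chunk index is
    // c = wc * weights_per_loop + ci, so chunks are visited contiguously.
    unsigned expected = 0;
    for(unsigned wc = 0; wc < in_loop; ++wc)
    {
        for(unsigned ci = 0; ci < weights_per_loop; ++ci)
        {
            const unsigned c = wc * weights_per_loop + ci;
            assert(c == expected);
            ++expected;
        }
    }
    std::printf("in_loop = %u, chunks visited = %u\n", in_loop, expected);
    return 0;
}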
14 changes: 10 additions & 4 deletions src/mlo_dir_conv.cpp
@@ -239,12 +239,14 @@ int mlo_construct_winograd::mloConstruct()
{
return (mloConstructBinaryWinograd3x3U(rmv));
}
#if MIOPEN_BACKEND_OPENCL
if(mloIsCorrectBinaryWinogradRxSFwd(rmv) &&
!miopen::IsDisabled(MIOPEN_DEBUG_AMD_WINOGRAD_RXS{}) &&
(no_perf_filtering || mloIsFastBinaryWinogradRxSFwd()))
{
return (mloConstructBinaryWinogradRxSFwd());
}
#endif
}
}
#endif
@@ -1395,7 +1397,7 @@ int mlo_construct_direct2D::mloConstructDirect2D1x1()
// n of input map per group
N_MAPS_PERGROUP = std::min(N_MAPS_PERGROUP, n_input_scaled);
// number of input loops
int n_in_loop = (n_input_scaled + N_MAPS_PERGROUP - 1) / N_MAPS_PERGROUP;
// int n_in_loop = (n_input_scaled + N_MAPS_PERGROUP - 1) / N_MAPS_PERGROUP;

// number of batches inside wk_item
_n_stacks = std::min(_batch_sz, _n_stacks);
@@ -1434,10 +1436,13 @@ int mlo_construct_direct2D::mloConstructDirect2D1x1()
std::to_string(static_cast<long long>(wei_cstride))
// algorithm parameters
+ std::string(" -DMLO_GRP_SZ0=") + std::to_string(static_cast<long long>(GRP_SZ)) +
std::string(" -DMLO_GRP_SZ1=") + std::to_string(1) + std::string(" -DMLO_GRP_SZ2=") +
std::to_string(1) +

std::string(" -DMLO_MAP_SZ4=") + std::to_string(static_cast<long long>(MAP_SZ4)) +
std::string(" -DMLO_C1x1_PIXLEFT=") + std::to_string(static_cast<long long>(C1x1_PIXLEFT)) +
std::string(" -DMLO_DIVBY4=") + std::to_string(static_cast<long long>(DIVBY4)) +
std::string(" -DMLO_IN_LOOP=") + std::to_string(static_cast<long long>(n_in_loop)) +
// std::string(" -DMLO_IN_LOOP=") + std::to_string(static_cast<long long>(n_in_loop)) +
std::string(" -DMLO_N_LCL_BATCHS=") +
std::to_string(static_cast<long long>(_n_stacks)) // # of diff stacks (part of batch).
+ std::string(" -DMLO_N_LCL_OUT_MAPS=") +
@@ -3380,8 +3385,9 @@ bool mlo_construct_BwdWrW2D::mloIsFastAsmDirect3x3WrW() const
// They work fine on gfx8
// /todo fix memory faults on gfx9
const std::string name = _stream->GetDeviceName();
return !(name == "gfx900" && (_in_width == 13 || _in_width == 27 || _in_width == 54 ||
_in_width == 57 || _in_width == 17 || _in_width == 250));
return !(name == "gfx900" &&
(_in_width == 13 || _in_width == 27 || _in_width == 54 || _in_width == 57 ||
_in_width == 17 || _in_width == 250 || _in_width == 175));
}

int mlo_construct_BwdWrW2D::mloConstructAsmDirect3x3WrW()
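The mlo_dir_conv.cpp changes complement the kernel edits: the generated compile options now define MLO_GRP_SZ1=1 and MLO_GRP_SZ2=1 (consumed by the new reqd_work_group_size attribute) and no longer pass -DMLO_IN_LOOP, which the kernel derives on its own. A minimal sketch of assembling such a -D option string; the helper and the group-size value are illustrative only, not MIOpen code:

#include <iostream>
#include <string>

// Illustrative helper for appending one -D macro definition to an OpenCL
// build-options string, in the spirit of the comp_options assembly above.
static std::string Define(const std::string& name, long long value)
{
    return " -D" + name + "=" + std::to_string(value);
}

int main()
{
    const long long grp_sz = 64; // hypothetical MLO_GRP_SZ value
    std::string comp_options;
    comp_options += Define("MLO_GRP_SZ0", grp_sz);
    comp_options += Define("MLO_GRP_SZ1", 1); // added by this commit
    comp_options += Define("MLO_GRP_SZ2", 1); // added by this commit
    // Note: no -DMLO_IN_LOOP here; MIOpenConv1x1.cl now computes it itself.

    std::cout << comp_options << '\n';
    return 0;
}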
