arm_compute v18.11
Jenkins committed Nov 22, 2018
1 parent 52ba29e commit b9abeae
Showing 12,114 changed files with 714,002 additions and 427,236 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
20 changes: 7 additions & 13 deletions README.md
@@ -1,30 +1,22 @@

Release repository: https://github.com/arm-software/ComputeLibrary
Development repository: https://review.mlplatform.org/#/admin/projects/ml/ComputeLibrary
Please report issues here: https://github.com/ARM-software/ComputeLibrary/issues
**Make sure you are using the latest version of the library before opening an issue. Thanks**

News:

- We're hiring: Staff Machine Learning C++ Software Engineer in Cambridge (UK)
  - Required skills:
    - Proficient in C++11.
  - Preferred skills:
    - Some SIMD (Preferably NEON and/or OpenCL) experience
    - Some machine learning / computer vision knowledge
    - Familiarity in developing compute-intensive applications and ideally industry experience of product development
    - Experience programming in assembly language.

  Interested ? Contact us: [email protected]
- [Gian Marco's talk on optimizing CNNs with Winograd algorithms at the EVS](https://www.embedded-vision.com/platinum-members/arm/embedded-vision-training/videos/pages/may-2018-embedded-vision-summit-iodice)
- [Gian Marco's talk on using SGEMM and FFTs to Accelerate Deep Learning](https://www.embedded-vision.com/platinum-members/arm/embedded-vision-training/videos/pages/may-2016-embedded-vision-summit-iodice)

Related projects:

- [Arm NN SDK](https://github.com/arm-software/armnn)
- [Caffe on Compute Library](https://github.com/OAID/Caffe-HRT)
- [Tutorial: Cartoonifying Images on Raspberry Pi with the Compute Library](https://community.arm.com/graphics/b/blog/posts/cartoonifying-images-on-raspberry-pi-with-the-compute-library)
- [Tutorial: Running AlexNet on Raspberry Pi with Compute Library](https://community.arm.com/processors/b/blog/posts/running-alexnet-on-raspberry-pi-with-compute-library)

Documentation available here:

- [v18.11](https://arm-software.github.io/ComputeLibrary/v18.11/)
- [v18.08](https://arm-software.github.io/ComputeLibrary/v18.08/)
- [v18.05](https://arm-software.github.io/ComputeLibrary/v18.05/)
- [v18.03](https://arm-software.github.io/ComputeLibrary/v18.03/)
@@ -40,6 +32,8 @@ Documentation available here:

Binaries available here:

- [v18.11-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v18.11/arm_compute-v18.11-bin-linux.tar.gz)
- [v18.11-android](https://github.com/ARM-software/ComputeLibrary/releases/download/v18.11/arm_compute-v18.11-bin-android.tar.gz)
- [v18.08-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v18.08/arm_compute-v18.08-bin-linux.tar.gz)
- [v18.08-android](https://github.com/ARM-software/ComputeLibrary/releases/download/v18.08/arm_compute-v18.08-bin-android.tar.gz)
- [v18.05-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v18.05/arm_compute-v18.05-bin-linux.tar.gz)
@@ -57,6 +51,6 @@ Binaries available here:
- [v17.04](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.04/arm_compute-v17.04-bin.tar.gz)
- [v17.03.1](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.03.1/arm_compute-v17.03.1-bin.tar.gz)

Support: [email protected]
Contact: [email protected]

License & Contributions: The software is provided under MIT license. Contributions to this project are accepted under the same license.
8 changes: 6 additions & 2 deletions SConscript
@@ -24,11 +24,12 @@ import os.path
import re
import subprocess

VERSION = "v18.08"
SONAME_VERSION="12.0.0"
VERSION = "v18.11"
SONAME_VERSION="13.0.0"

Import('env')
Import('vars')
Import('install_lib')

def build_library(name, sources, static=False, libs=[]):
if static:
@@ -53,6 +54,7 @@ def build_library(name, sources, static=False, libs=[]):
else:
obj = arm_compute_env.SharedLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs)

obj = install_lib(obj)
Default(obj)
return obj

@@ -208,6 +210,8 @@ if env['neon']:

if "arm64-v8" in env['arch']:
core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a64_*/*.cpp')
if "sve" in env['arch']:
core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp')

runtime_files += Glob('src/runtime/NEON/*.cpp')
runtime_files += Glob('src/runtime/NEON/functions/*.cpp')
75 changes: 58 additions & 17 deletions SConstruct
@@ -40,7 +40,7 @@ vars.AddVariables(
BoolVariable("debug", "Debug", False),
BoolVariable("asserts", "Enable asserts (this flag is forced to 1 for debug=1)", False),
BoolVariable("logging", "Logging (this flag is forced to 1 for debug=1)", False),
EnumVariable("arch", "Target Architecture", "armv7a", allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "x86_32", "x86_64")),
EnumVariable("arch", "Target Architecture", "armv7a", allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "arm64-v8.2-a-sve", "x86_32", "x86_64")),
EnumVariable("os", "Target OS", "linux", allowed_values=("linux", "android", "bare_metal")),
EnumVariable("build", "Build type", "cross_compile", allowed_values=("native", "cross_compile", "embed_only")),
BoolVariable("examples", "Build example programs", True),
@@ -54,21 +54,52 @@ vars.AddVariables(
BoolVariable("openmp", "Enable OpenMP backend", False),
BoolVariable("cppthreads", "Enable C++11 threads backend", True),
PathVariable("build_dir", "Specify sub-folder for the build", ".", PathVariable.PathAccept),
PathVariable("install_dir", "Specify sub-folder for the install", "", PathVariable.PathAccept),
("extra_cxx_flags", "Extra CXX flags to be appended to the build command", ""),
("extra_link_flags", "Extra LD flags to be appended to the build command", ""),
("compiler_cache", "Command to prefix to the C and C++ compiler (e.g ccache)", "")
)

env = Environment(platform="posix", variables=vars, ENV = os.environ)
env.Append(LIBPATH = ["#build/%s" % env['build_dir']])
build_path = env['build_dir']
# If build_dir is a relative path then add a #build/ prefix:
if not env['build_dir'].startswith('/'):
SConsignFile('build/%s/.scons' % build_path)
build_path = "#build/%s" % build_path
else:
SConsignFile('%s/.scons' % build_path)

install_path = env['install_dir']
#If the install_dir is a relative path then assume it's from inside build_dir
if not env['install_dir'].startswith('/') and install_path != "":
install_path = "%s/%s" % (build_path, install_path)

env.Append(LIBPATH = [build_path])
Export('env')
Export('vars')

SConsignFile('build/.%s' % env['build_dir'])
def install_lib( lib ):
# If there is no install folder, then there is nothing to do:
if install_path == "":
return lib
return env.Install( "%s/lib/" % install_path, lib)
def install_bin( bin ):
# If there is no install folder, then there is nothing to do:
if install_path == "":
return bin
return env.Install( "%s/bin/" % install_path, bin)
def install_include( inc ):
if install_path == "":
return inc
return env.Install( "%s/include/" % install_path, inc)

Export('install_lib')
Export('install_bin')

Help(vars.GenerateHelpText(env))

if env['build'] == "embed_only":
SConscript('./SConscript', variant_dir='#build/%s' % env['build_dir'], duplicate=0)
SConscript('./SConscript', variant_dir=build_path, duplicate=0)
Return()

if env['neon'] and 'x86' in env['arch']:
@@ -142,17 +173,23 @@ elif env['arch'] == 'arm64-v8a':
prefix = "aarch64-linux-android-"
if 'clang++' in cpp_compiler:
env.Append(CXXFLAGS = ['-no-integrated-as'])
elif env['arch'] == 'arm64-v8.2-a':
env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16']) # explicitly enable fp16 extension otherwise __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is undefined
elif 'arm64-v8.2-a' in env['arch']:
if env['arch'] == 'arm64-v8.2-a-sve':
if env['os'] != 'bare_metal':
print("Only bare metal SVE is supported at the moment")
Exit(1)
env.Append(CXXFLAGS = ['-march=armv8.2-a+sve+fp16+dotprod'])
else:
env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16']) # explicitly enable fp16 extension otherwise __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is undefined
if env['os'] == 'linux':
prefix = "aarch64-linux-gnu-"
elif env['os'] == 'bare_metal':
prefix = "aarch64-elf-"
elif env['os'] == 'android':
prefix = "aarch64-linux-android-"
env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8_2','NO_DOT_IN_TOOLCHAIN'])
if 'clang++' in cpp_compiler:
env.Append(CXXFLAGS = ['-no-integrated-as'])
if env['os'] == 'linux':
prefix = "aarch64-linux-gnu-"
elif env['os'] == 'bare_metal':
prefix = "aarch64-elf-"
elif env['os'] == 'android':
prefix = "aarch64-linux-android-"
elif env['arch'] == 'x86_32':
env.Append(CCFLAGS = ['-m32'])
env.Append(LINKFLAGS = ['-m32'])
@@ -242,20 +279,24 @@ if env['logging']:

env.Append(CPPPATH = ['#/include', "#"])
env.Append(CXXFLAGS = env['extra_cxx_flags'])
env.Append(LINKFLAGS = env['extra_link_flags'])

Default( install_include("arm_compute"))
Default( install_include("support"))

Export('version_at_least')

if env['opencl']:
SConscript("./opencl-1.2-stubs/SConscript", variant_dir="build/%s/opencl-1.2-stubs" % env['build_dir'], duplicate=0)
SConscript("./opencl-1.2-stubs/SConscript", variant_dir="%s/opencl-1.2-stubs" % build_path, duplicate=0)

if env['gles_compute'] and env['os'] != 'android':
env.Append(CPPPATH = ['#/include/linux'])
SConscript("./opengles-3.1-stubs/SConscript", variant_dir="build/%s/opengles-3.1-stubs" % env['build_dir'], duplicate=0)
SConscript("./opengles-3.1-stubs/SConscript", variant_dir="%s/opengles-3.1-stubs" % build_path, duplicate=0)

SConscript('./SConscript', variant_dir='#build/%s' % env['build_dir'], duplicate=0)
SConscript('./SConscript', variant_dir=build_path, duplicate=0)

if env['examples'] and env['os'] != 'bare_metal':
SConscript('./examples/SConscript', variant_dir='#build/%s/examples' % env['build_dir'], duplicate=0)
SConscript('./examples/SConscript', variant_dir='%s/examples' % build_path, duplicate=0)

if env['os'] != 'bare_metal':
SConscript('./tests/SConscript', variant_dir='#build/%s/tests' % env['build_dir'], duplicate=0)
SConscript('./tests/SConscript', variant_dir='%s/tests' % build_path, duplicate=0)
8 changes: 8 additions & 0 deletions arm_compute/core/CL/CLHelpers.h
@@ -47,6 +47,14 @@ static constexpr unsigned int max_cl_vector_width = 16;
*/
std::string get_cl_type_from_data_type(const DataType &dt);

/** Translates a tensor data type to the appropriate OpenCL select type.
*
* @param[in] dt @ref DataType to be translated to OpenCL select type.
*
* @return The string specifying the OpenCL select type to be used.
*/
std::string get_cl_select_type_from_data_type(const DataType &dt);

/** Get the size of a data type in number of bits.
*
* @param[in] dt @ref DataType.
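The new helper complements get_cl_type_from_data_type for kernels that need a type suitable for OpenCL's select() builtin. A minimal usage sketch follows; the -D macro name and helper function are illustrative assumptions, not taken from the library:

```cpp
#include <string>

#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// Compose a build option that passes the select() companion type of a tensor's
// data type to an OpenCL kernel, e.g. as one of the kernel's -D defines.
std::string make_select_type_define(DataType dt)
{
    return "-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt);
}
```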
17 changes: 17 additions & 0 deletions arm_compute/core/CL/CLKernels.h
@@ -32,10 +32,12 @@
#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
@@ -64,10 +66,13 @@
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
@@ -78,6 +83,7 @@
#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
@@ -90,35 +96,46 @@
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"

#endif /* __ARM_COMPUTE_CLKERNELS_H__ */
4 changes: 4 additions & 0 deletions arm_compute/core/CL/OpenCL.h
@@ -37,6 +37,9 @@
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#pragma GCC diagnostic ignored "-Wignored-qualifiers"
#if defined(__GNUG__) && __GNUG__ >= 8
#pragma GCC diagnostic ignored "-Wcatch-value"
#endif // defined(__GNUG__) && __GNUG__ >= 8
#include <CL/cl2.hpp>
#pragma GCC diagnostic pop

@@ -114,6 +117,7 @@ class CLSymbols final
DECLARE_FUNCTION_PTR(clReleaseMemObject);
DECLARE_FUNCTION_PTR(clGetDeviceInfo);
DECLARE_FUNCTION_PTR(clGetDeviceIDs);
DECLARE_FUNCTION_PTR(clGetMemObjectInfo);
DECLARE_FUNCTION_PTR(clRetainEvent);
DECLARE_FUNCTION_PTR(clGetPlatformIDs);
DECLARE_FUNCTION_PTR(clGetKernelWorkGroupInfo);
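The clGetMemObjectInfo symbol added to CLSymbols wraps the standard OpenCL entry point of the same name. As a reminder of what that entry point does, a minimal sketch using the plain OpenCL C API (independent of the library's dynamic-loading machinery):

```cpp
#include <CL/cl.h>

// Query the size in bytes of an existing cl_mem buffer.
// 'buffer' is assumed to be a valid cl_mem created elsewhere.
size_t query_buffer_size(cl_mem buffer)
{
    size_t size = 0;
    const cl_int err = clGetMemObjectInfo(buffer, CL_MEM_SIZE, sizeof(size), &size, nullptr);
    return (err == CL_SUCCESS) ? size : 0;
}
```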
2 changes: 1 addition & 1 deletion arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
@@ -51,7 +51,7 @@ class CLArithmeticAdditionKernel : public ICLKernel
CLArithmeticAdditionKernel &operator=(CLArithmeticAdditionKernel &&) = default;
/** Default destructor */
~CLArithmeticAdditionKernel() = default;
/** Initialise the kernel's inputs, output and convertion policy.
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/S16/F16/F32.
* @param[in] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
15 changes: 8 additions & 7 deletions arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
@@ -53,19 +53,19 @@ class CLArithmeticSubtractionKernel : public ICLKernel
/** Default destructor */
~CLArithmeticSubtractionKernel() = default;

/** Initialise the kernel's inputs, output and convertion policy.
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32.
* @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32.
* @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16/F32.
* @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/S16/F16/F32.
* @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/S16/F16/F32.
* @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8/S16/F16/F32.
* @param[in] policy Policy to use to handle overflow.
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticSubtractionKernel
*
* @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32.
* @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32.
* @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), S16/F16/F32.
* @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/F16/F32.
* @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/S16/F16/F32.
* @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8/S16/F16/F32.
* @param[in] policy Policy to use to handle overflow.
*
* @return a status
@@ -74,6 +74,7 @@ class CLArithmeticSubtractionKernel : public ICLKernel

// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
BorderSize border_size() const override;

private:
const ICLTensor *_input1; /**< Source tensor 1 */
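A minimal sketch of how the static validate() documented above can be used to check a configuration up front, here with F32 tensors; the shapes and the helper function are illustrative assumptions:

```cpp
#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// Check, without touching any OpenCL resources, whether an element-wise
// subtraction of two 16x16 F32 tensors is a supported configuration.
bool subtraction_config_is_valid()
{
    const TensorInfo input1(TensorShape(16U, 16U), 1, DataType::F32);
    const TensorInfo input2(TensorShape(16U, 16U), 1, DataType::F32);
    const TensorInfo output(TensorShape(16U, 16U), 1, DataType::F32);

    const Status status = CLArithmeticSubtractionKernel::validate(&input1, &input2, &output, ConvertPolicy::SATURATE);
    return status.error_code() == ErrorCode::OK;
}
```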