diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85ada5200..2f6e7f406 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,6 +185,10 @@ if (WITH_CUDA)
     message(STATUS "Found TensorRT include directory: ${TENSORRT_INCLUDE_DIR}")
   endif()
 
+  # The TensorRT 6 headers generate a lot of deprecation warnings. Ignore them.
+  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -isystem ${TENSORRT_INCLUDE_DIR}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isystem ${TENSORRT_INCLUDE_DIR}")
+
   find_path(CUB_INCLUDE_DIR NAMES cub/cub.cuh)
   if(NOT CUB_INCLUDE_DIR)
     message(FATAL_ERROR "CUB library not found")
@@ -201,11 +205,8 @@ if (WITH_CUDA)
     message(STATUS "Found Thrust include directory: ${THRUST_INCLUDE_DIR}")
   endif()
 
-  list(APPEND INCLUDE_DIRECTORIES ${TENSORRT_INCLUDE_DIR})
   cuda_include_directories(
-    ${INCLUDE_DIRECTORIES}
     ${CUB_INCLUDE_DIR}
-    ${THRUST_INCLUDE_DIR}
     )
   cuda_add_library(${PROJECT_NAME}
     ${SOURCES}
diff --git a/README.md b/README.md
index 216744e04..23e0912a4 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ CTranslate2 uses the following libraries for acceleration:
   * [Intel MKL-DNN](https://github.com/intel/mkl-dnn) (>=0.20,<1.0)
 * GPU
   * [CUB](https://nvlabs.github.io/cub/) (>=1.8.0)
-  * [TensorRT](https://developer.nvidia.com/tensorrt) (==5.*)
+  * [TensorRT](https://developer.nvidia.com/tensorrt) (==6.*)
   * [Thrust](https://docs.nvidia.com/cuda/thrust/index.html) (==1.9.3, included in CUDA 10.0)
   * [cuBLAS](https://developer.nvidia.com/cublas) (with CUDA>=10.0)
   * [cuDNN](https://developer.nvidia.com/cudnn) (>=7.5)
diff --git a/docker/Dockerfile.centos7-gpu b/docker/Dockerfile.centos7-gpu
index 736ccdeb1..ca2531f0b 100644
--- a/docker/Dockerfile.centos7-gpu
+++ b/docker/Dockerfile.centos7-gpu
@@ -37,8 +37,8 @@ RUN wget https://github.com/intel/mkl-dnn/archive/v$MKLDNN_VERSION.tar.gz && \
     make -j4 && make install && \
    cd ../.. && rm -r mkl-dnn-*
 
-ENV TENSORRT_MAJOR_VERSION=5
-ENV TENSORRT_VERSION=${TENSORRT_MAJOR_VERSION}.1.5
+ENV TENSORRT_MAJOR_VERSION=6
+ENV TENSORRT_VERSION=${TENSORRT_MAJOR_VERSION}.0.1
 RUN curl -fsSL https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/libnvinfer-devel-${TENSORRT_VERSION}-1.cuda10.0.x86_64.rpm -O && \
     curl -fsSL https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/libnvinfer${TENSORRT_MAJOR_VERSION}-${TENSORRT_VERSION}-1.cuda10.0.x86_64.rpm -O && \
     rpm -ivh --nodeps libnvinfer*.rpm && \
diff --git a/docker/Dockerfile.ubuntu-gpu b/docker/Dockerfile.ubuntu-gpu
index 75ff468fd..acdaf497f 100644
--- a/docker/Dockerfile.ubuntu-gpu
+++ b/docker/Dockerfile.ubuntu-gpu
@@ -1,8 +1,8 @@
 ARG UBUNTU_VERSION=18.04
 FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu${UBUNTU_VERSION} as builder
 
-ENV TENSORRT_MAJOR_VERSION=5
-ENV TENSORRT_VERSION=${TENSORRT_MAJOR_VERSION}.1.5
+ENV TENSORRT_MAJOR_VERSION=6
+ENV TENSORRT_VERSION=${TENSORRT_MAJOR_VERSION}.0.1
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
diff --git a/src/cuda/utils.cc b/src/cuda/utils.cc
index 131c29df3..0dbf2fef7 100644
--- a/src/cuda/utils.cc
+++ b/src/cuda/utils.cc
@@ -140,14 +140,10 @@ namespace ctranslate2 {
     } g_allocator;
 
-    static int max_batch_size = 512;
-
     static nvinfer1::IBuilder* get_trt_builder() {
       static thread_local nvinfer1::IBuilder* builder = nullptr;
       if (!builder) {
         builder = nvinfer1::createInferBuilder(g_logger);
-        builder->setMaxBatchSize(max_batch_size);
-        builder->setMaxWorkspaceSize(1 << 30);
         builder->setGpuAllocator(&g_allocator);
       }
       return builder;
@@ -162,42 +158,34 @@ namespace ctranslate2 {
     }
 
     TensorRTLayer::~TensorRTLayer() {
-      clear();
-    }
-
-    void TensorRTLayer::build(bool force) {
-      if (!_network || force) {
-        clear();
-        auto builder = get_trt_builder();
-        _network = builder->createNetwork();
-        build_network(_network);
-        _engine = builder->buildCudaEngine(*_network);
-        _execution_context = _engine->createExecutionContext();
-      }
-    }
-
-    void TensorRTLayer::clear() {
-      if (_network) {
-        _network->destroy();
-        _network = nullptr;
-      }
-      if (_engine) {
-        _engine->destroy();
-        _engine = nullptr;
-      }
       if (_execution_context) {
         _execution_context->destroy();
-        _execution_context = nullptr;
+        _network->destroy();
+        _engine->destroy();
+        _builder_config->destroy();
       }
     }
 
-    void TensorRTLayer::run(int batch_size, void** bindings) {
-      if (batch_size > max_batch_size)
-        throw std::runtime_error("Maximum batch size supported by the TensorRT engine is "
-                                 + std::to_string(max_batch_size) + ", but got "
-                                 + std::to_string(batch_size));
-      build();
-      _execution_context->enqueue(batch_size, bindings, get_cuda_stream(), nullptr);
+    void TensorRTLayer::build() {
+      auto builder = get_trt_builder();
+      _network = builder->createNetworkV2(
+        1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
+      build_network(_network);
+      auto profile = builder->createOptimizationProfile();
+      set_optimization_profile(profile);
+      _builder_config = builder->createBuilderConfig();
+      _builder_config->setMaxWorkspaceSize(1 << 30);
+      _builder_config->addOptimizationProfile(profile);
+      _engine = builder->buildEngineWithConfig(*_network, *_builder_config);
+      _execution_context = _engine->createExecutionContext();
+    }
+
+    void TensorRTLayer::run(void** bindings, const std::vector<nvinfer1::Dims>& input_dims) {
+      if (!_execution_context)
+        build();
+      for (size_t i = 0; i < input_dims.size(); ++i)
+        _execution_context->setBindingDimensions(i, input_dims[i]);
+      _execution_context->enqueueV2(bindings, get_cuda_stream(), nullptr);
     }
 
   }
diff --git a/src/cuda/utils.h b/src/cuda/utils.h
index dfaeb6eb8..d92f61811 100644
--- a/src/cuda/utils.h
+++ b/src/cuda/utils.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <string>
+#include <vector>
 
 #include <cublas_v2.h>
 #include <cudnn.h>
@@ -49,15 +50,18 @@ namespace ctranslate2 {
       virtual ~TensorRTLayer();
 
     protected:
+      void run(void** bindings, const std::vector<nvinfer1::Dims>& input_dims);
+
+      // These methods are called on the first call to run().
       virtual void build_network(nvinfer1::INetworkDefinition* network) = 0;
-      void run(int batch_size, void** bindings);
-      void build(bool force = false);
-      void clear();
+      virtual void set_optimization_profile(nvinfer1::IOptimizationProfile* profile) = 0;
 
     private:
+      void build();
       nvinfer1::INetworkDefinition* _network = nullptr;
       nvinfer1::ICudaEngine* _engine = nullptr;
       nvinfer1::IExecutionContext* _execution_context = nullptr;
+      nvinfer1::IBuilderConfig* _builder_config = nullptr;
     };
 
     // Statically associate cudnnDataType_t with a C++ type.
diff --git a/src/ops/topk_gpu.cu b/src/ops/topk_gpu.cu
index d3d756292..a5a7d2f69 100644
--- a/src/ops/topk_gpu.cu
+++ b/src/ops/topk_gpu.cu
@@ -9,17 +9,15 @@ namespace ctranslate2 {
     public:
       TopKLayer(int k)
         : _k(k)
-        , _depth(0) {
+        , _first_depth(0) {
       }
 
      void operator()(const StorageView& x, StorageView& values, StorageView& indices) {
        int depth = x.dim(-1);
        int batch_size = x.size() / depth;
 
-        if (depth != _depth) {
-          _depth = depth;
-          build(/*force=*/true);
-        }
+        if (_first_depth == 0)
+          _first_depth = depth;
 
        void* bindings[3] = {
          const_cast<float*>(x.data<float>()),
@@ -27,14 +25,15 @@
          indices.data<int32_t>()
        };
 
-        run(batch_size, bindings);
+        run(bindings, {nvinfer1::Dims2(batch_size, depth)});
      }
 
    protected:
      void build_network(nvinfer1::INetworkDefinition* network) override {
-        nvinfer1::Dims input_dim{1, {_depth}, {nvinfer1::DimensionType::kCHANNEL}};
-        nvinfer1::ITensor* input = network->addInput("x", nvinfer1::DataType::kFLOAT, input_dim);
-        nvinfer1::ITopKLayer* topk = network->addTopK(*input, nvinfer1::TopKOperation::kMAX, _k, 1);
+        nvinfer1::ITensor* input = network->addInput("x",
+                                                     nvinfer1::DataType::kFLOAT,
+                                                     nvinfer1::Dims2(-1, -1));
+        nvinfer1::ITopKLayer* topk = network->addTopK(*input, nvinfer1::TopKOperation::kMAX, _k, 2);
        nvinfer1::ITensor* values_t = topk->getOutput(0);
        nvinfer1::ITensor* indices_t = topk->getOutput(1);
        network->markOutput(*values_t);
@@ -44,9 +43,23 @@
        indices_t->setType(nvinfer1::DataType::kINT32);
      }
 
+      void set_optimization_profile(nvinfer1::IOptimizationProfile* profile) override {
+        // Optimize for the first seen depth, which covers the standard use case
+        // of running TopK over a static vocabulary size.
+        profile->setDimensions("x",
+                               nvinfer1::OptProfileSelector::kMIN,
+                               nvinfer1::Dims2(1, _first_depth));
+        profile->setDimensions("x",
+                               nvinfer1::OptProfileSelector::kOPT,
+                               nvinfer1::Dims2(64, _first_depth));
+        profile->setDimensions("x",
+                               nvinfer1::OptProfileSelector::kMAX,
+                               nvinfer1::Dims2(1024, _first_depth));
+      }
+
    private:
      int _k;
-      int _depth;
+      int _first_depth;
    };
 
    template
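
Note for reviewers: this patch exercises the three TensorRT 6 API changes at once (explicit-batch networks via `createNetworkV2`, optimization profiles for dynamic shapes, and `IBuilderConfig` taking over the builder-level `setMaxBatchSize`/`setMaxWorkspaceSize`). Below is a minimal, self-contained sketch of the same build/run flow the new `TensorRTLayer` implements. The logger, the tensor name `x`, and the k/depth/batch/profile values are illustrative assumptions, not code from this repository, and error checking is omitted.

```cpp
// Sketch of the TensorRT 6 explicit-batch + dynamic-shape workflow.
// Assumes TensorRT 6 and CUDA 10; all concrete values are illustrative.
#include <NvInfer.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <iostream>

class Logger : public nvinfer1::ILogger {
  void log(Severity severity, const char* msg) override {
    if (severity <= Severity::kWARNING)
      std::cerr << msg << std::endl;
  }
} g_logger;

int main() {
  const int k = 8;
  const int depth = 32000;   // e.g. a fixed vocabulary size
  const int batch_size = 4;  // may vary between runs within the profile bounds

  auto builder = nvinfer1::createInferBuilder(g_logger);

  // TensorRT 6: dynamic shapes require an explicit-batch network.
  auto network = builder->createNetworkV2(
    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

  // -1 marks dynamic dimensions; the real sizes are bound before each run.
  auto input = network->addInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims2(-1, -1));
  auto topk = network->addTopK(*input, nvinfer1::TopKOperation::kMAX, k, /*reduceAxes=*/2);
  network->markOutput(*topk->getOutput(0));  // values
  network->markOutput(*topk->getOutput(1));  // indices
  topk->getOutput(1)->setType(nvinfer1::DataType::kINT32);

  // Every dynamic input needs min/opt/max shapes in an optimization profile.
  auto profile = builder->createOptimizationProfile();
  profile->setDimensions("x", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims2(1, depth));
  profile->setDimensions("x", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims2(64, depth));
  profile->setDimensions("x", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims2(1024, depth));

  // setMaxWorkspaceSize moved from IBuilder to IBuilderConfig in TensorRT 6.
  auto config = builder->createBuilderConfig();
  config->setMaxWorkspaceSize(1 << 30);
  config->addOptimizationProfile(profile);

  auto engine = builder->buildEngineWithConfig(*network, *config);
  auto context = engine->createExecutionContext();

  // Bindings are ordered: inputs first, then outputs in markOutput order.
  void* x; void* values; void* indices;
  cudaMalloc(&x, batch_size * depth * sizeof(float));
  cudaMalloc(&values, batch_size * k * sizeof(float));
  cudaMalloc(&indices, batch_size * k * sizeof(int32_t));
  void* bindings[3] = {x, values, indices};

  // Resolve the dynamic dimensions, then use the batch-agnostic enqueueV2.
  context->setBindingDimensions(0, nvinfer1::Dims2(batch_size, depth));
  context->enqueueV2(bindings, /*stream=*/0, nullptr);
  cudaDeviceSynchronize();

  // TensorRT 6 objects are released with destroy(), as in ~TensorRTLayer above.
  context->destroy(); engine->destroy(); config->destroy();
  network->destroy(); builder->destroy();
  cudaFree(x); cudaFree(values); cudaFree(indices);
  return 0;
}
```

The trade-off this encodes: the engine is now built once, from the first input, and later calls must stay within the profile bounds (here a leading dimension in [1, 1024] and the first seen depth). TensorRT rejects binding dimensions outside those bounds, which takes over the role of both the old `max_batch_size` check and the old rebuild-on-depth-change path.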