diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
index a5a433224d16ed..c53f028d9e82a7 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -89,6 +89,7 @@ class ZeroInferRequest final : public SyncInferRequest {
     std::unique_ptr<Pipeline> _pipeline;

     bool _pipelineIsCreated = false;
+    bool _externalMemoryStandardAllocationSupported = false;
 };

 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/backend/include/zero_memory.hpp b/src/plugins/intel_npu/src/backend/include/zero_memory.hpp
index 77ba6fa5370f74..867661f189a116 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_memory.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_memory.hpp
@@ -12,6 +12,7 @@

 #include "intel_npu/utils/logger/logger.hpp"
 #include "intel_npu/utils/zero/zero_init.hpp"
+#include "openvino/runtime/itensor.hpp"

 namespace {

@@ -23,7 +24,7 @@ namespace intel_npu {
 namespace zeroMemory {
 // Create an allocator that uses the ov::Allocator signature that will be used to create the tensor.
-class HostMemAllocator final {
+class HostMemAllocator {
 public:
     explicit HostMemAllocator(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
                               ze_host_mem_alloc_flag_t flag = {})
@@ -36,7 +37,7 @@ class HostMemAllocator final {
      * @param bytes The size in bytes to allocate
      * @return Handle to the allocated resource
      */
-    void* allocate(const size_t bytes, const size_t alignment = STANDARD_PAGE_SIZE) noexcept;
+    virtual void* allocate(const size_t bytes, const size_t alignment = STANDARD_PAGE_SIZE) noexcept;

     /**
      * @brief Releases handle and all associated memory resources which invalidates the handle.
@@ -47,7 +48,7 @@ class HostMemAllocator final {

     bool is_equal(const HostMemAllocator& other) const;

-private:
+protected:
     const std::shared_ptr<ZeroInitStructsHolder> _initStructs;

     Logger _logger;

@@ -56,5 +57,19 @@ class HostMemAllocator final {
     static const std::size_t _alignment = STANDARD_PAGE_SIZE;
 };

+class HostMemSharedAllocator final : public HostMemAllocator {
+public:
+    explicit HostMemSharedAllocator(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
+                                    const std::shared_ptr<ov::ITensor>& tensor,
+                                    ze_host_mem_alloc_flag_t flag = {})
+        : HostMemAllocator(initStructs, flag),
+          _tensor(tensor) {}
+
+    void* allocate(const size_t bytes = 0, const size_t alignment = STANDARD_PAGE_SIZE) noexcept override;
+
+private:
+    const std::shared_ptr<ov::ITensor> _tensor;
+};
+
 }  // namespace zeroMemory
 }  // namespace intel_npu
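For reference on the "ov::Allocator signature" mentioned in the header comment above: any object exposing this allocate/deallocate/is_equal trio can back an ov::Tensor. A minimal sketch outside the plugin, assuming only the public OpenVINO API; the PageAlignedAllocator name and the std::aligned_alloc backing are illustrative, not part of this patch:

#include <cstddef>
#include <cstdlib>
#include <openvino/runtime/allocator.hpp>
#include <openvino/runtime/tensor.hpp>

// Illustrative stand-in for HostMemAllocator: same method signatures that
// ov::Allocator expects, backed by std::aligned_alloc instead of zeMemAllocHost.
struct PageAlignedAllocator {
    void* allocate(const std::size_t bytes, const std::size_t alignment = 4096) {
        // Round the size up to a multiple of the alignment, as aligned_alloc requires.
        const std::size_t size = (bytes + alignment - 1) & ~(alignment - 1);
        return std::aligned_alloc(alignment, size);
    }
    void deallocate(void* handle, const std::size_t /*bytes*/, const std::size_t /*alignment*/ = 4096) {
        std::free(handle);
    }
    bool is_equal(const PageAlignedAllocator& /*other*/) const {
        return true;  // stateless, all instances are interchangeable
    }
};

// The tensor's backing buffer is now 4 KB aligned, which is the property the
// import path added by this patch checks for.
ov::Tensor make_page_aligned_tensor(const ov::Shape& shape) {
    return ov::Tensor(ov::element::f32, shape, ov::Allocator(PageAlignedAllocator{}));
}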
diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
index 4a5e77820e3b70..c37f5dad95b3e9 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -4,6 +4,10 @@

 #include "zero_infer_request.hpp"

+#include
+
+#include
+
 #include "intel_npu/common/itt.hpp"
 #include "intel_npu/config/options.hpp"
 #include "intel_npu/prefix.hpp"
@@ -62,6 +66,13 @@ void check_level_zero_attributes_match(const IODescriptor& ioDescriptor, const ArgumentDescriptor& zeDescriptor) {
     }
 }

+bool memory_aligned_to_standard_page_size(void* addr) {
+    auto addr_int = reinterpret_cast<uintptr_t>(addr);
+
+    // addr is aligned to standard page size
+    return (addr_int & 0xFFF) == 0;
+}
+
 }  // namespace

 //------------------------------------------------------------------------------

@@ -79,6 +90,15 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
       _levelZeroOutputTensors(_metadata.outputs.size(),
                               nullptr) {
     _logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest");
+    ze_device_external_memory_properties_t desc = {};
+    desc.stype = ZE_STRUCTURE_TYPE_DEVICE_EXTERNAL_MEMORY_PROPERTIES;
+    auto res = zeDeviceGetExternalMemoryProperties(_initStructs->getDevice(), &desc);
+    if (res == ZE_RESULT_SUCCESS) {
+        if (desc.memoryAllocationImportTypes & ZE_EXTERNAL_MEMORY_TYPE_FLAG_STANDARD_ALLOCATION) {
+            _externalMemoryStandardAllocationSupported = true;
+        }
+    }
+
     _outputAllocator = std::make_shared<zeroMemory::HostMemAllocator>(_initStructs);
     _inputAllocator = std::make_shared<zeroMemory::HostMemAllocator>(_initStructs, ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED);
@@ -215,18 +235,33 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tensor,
         levelZeroTensors = tensor;
         updateCommandListArg = true;
     } else {
-        auto zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(levelZeroTensors);
-        if (zeroTensor == nullptr || (zeroTensor != nullptr && zeroTensor->tensor_was_shared_with_user())) {
-            _logger.debug("ZeroInferRequest::set_tensor_data - create locally L0 tensor");
-            OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "allocate tensor");
+        if (_externalMemoryStandardAllocationSupported && memory_aligned_to_standard_page_size(tensor->data())) {
+            _logger.debug("ZeroInferRequest::set_tensor_data - import memory from a system memory pointer");
+            auto hostMemSharedAllocator = zeroMemory::HostMemSharedAllocator(_initStructs, tensor);
+            levelZeroTensors = std::make_shared<ZeroTensor>(_initStructs,
+                                                            _config,
+                                                            tensor->get_element_type(),
+                                                            tensor->get_shape(),
+                                                            hostMemSharedAllocator);

-            levelZeroTensors = allocate_tensor(isInput ? _metadata.inputs.at(index) : _metadata.outputs.at(index),
-                                               index,
-                                               isInput,
-                                               isInput ? *_inputAllocator : *_outputAllocator,
-                                               _graph->get_batch_size());
+            std::dynamic_pointer_cast<ZeroTensor>(levelZeroTensors)->set_tensor_shared_with_user();

             updateCommandListArg = true;
+        } else {
+            auto zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(levelZeroTensors);
+
+            if (zeroTensor == nullptr || (zeroTensor != nullptr && zeroTensor->tensor_was_shared_with_user())) {
+                _logger.debug("ZeroInferRequest::set_tensor_data - create locally L0 tensor");
+                OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "allocate tensor");
+
+                levelZeroTensors = allocate_tensor(isInput ? _metadata.inputs.at(index) : _metadata.outputs.at(index),
+                                                   index,
+                                                   isInput,
+                                                   isInput ? *_inputAllocator : *_outputAllocator,
+                                                   _graph->get_batch_size());
+
+                updateCommandListArg = true;
+            }
         }
     }
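The memory_aligned_to_standard_page_size() helper above hard-codes the 4 KB page through the 0xFFF mask. For reference, the same predicate written with the page size as a parameter (is_aligned_to is a hypothetical helper used only for illustration, not part of the patch); for a power-of-two page size the mask test is equivalent to addr % page_size == 0:

#include <cstdint>

inline bool is_aligned_to(const void* addr, std::uintptr_t page_size = 4096) {
    // Equivalent to (addr & 0xFFF) == 0 when page_size is 4096.
    return (reinterpret_cast<std::uintptr_t>(addr) & (page_size - 1)) == 0;
}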
@@ -516,6 +551,32 @@ void ZeroInferRequest::update_states_if_memory_changed() {
             get_level_zero_input(zeroState->get_tensor_index()) = zeroState->get_state()._ptr;
             _levelZeroOutputTensors.at(zeroState->get_related_tensor_index()) = zeroState->get_state()._ptr;
+        } else {
+            if (_externalMemoryStandardAllocationSupported &&
+                memory_aligned_to_standard_page_size(zeroState->get_state()->data())) {
+                auto hostMemSharedAllocator =
+                    zeroMemory::HostMemSharedAllocator(_initStructs, zeroState->get_state()._ptr);
+
+                get_level_zero_input(zeroState->get_tensor_index()) =
+                    std::make_shared<ZeroTensor>(_initStructs,
+                                                 _config,
+                                                 zeroState->get_state()->get_element_type(),
+                                                 zeroState->get_state()->get_shape(),
+                                                 hostMemSharedAllocator);
+
+                _levelZeroOutputTensors.at(zeroState->get_related_tensor_index()) =
+                    get_level_zero_input(zeroState->get_tensor_index());
+
+                _pipeline->update_graph_arguments(
+                    _graphInputDescriptors.at(zeroState->get_tensor_index()).idx,
+                    _levelZeroOutputTensors.at(zeroState->get_related_tensor_index())->data(),
+                    _levelZeroOutputTensors.at(zeroState->get_related_tensor_index())->get_byte_size());
+
+                _pipeline->update_graph_arguments(
+                    _graphOutputDescriptors.at(zeroState->get_related_tensor_index()).idx,
+                    _levelZeroOutputTensors.at(zeroState->get_related_tensor_index())->data(),
+                    _levelZeroOutputTensors.at(zeroState->get_related_tensor_index())->get_byte_size());
+            }
         }
     }
 }
@@ -616,15 +677,16 @@ void ZeroInferRequest::infer_async() {
         const auto& levelZeroTensor = get_level_zero_input(inputIndex);
         if (!is_remote_tensor(levelZeroTensor)) {
             void* levelZeroBuffer = levelZeroTensor->data();
+            if (userBuffer == nullptr || levelZeroBuffer == nullptr) {
+                OPENVINO_THROW("Empty buffer");
+            }

-            if (userBuffer != levelZeroBuffer) {
-                if (userBuffer == nullptr || levelZeroBuffer == nullptr) {
-                    OPENVINO_THROW("Empty buffer");
+            if (!_externalMemoryStandardAllocationSupported || !memory_aligned_to_standard_page_size(userBuffer)) {
+                if (userBuffer != levelZeroBuffer) {
+                    _logger.info("Tensor is not allocated in the current Level Zero context");
+                    OV_ITT_TASK_NEXT(ZERO_INFER, "memcpy");
+                    std::memcpy(levelZeroBuffer, userBuffer, userTensor.at(SINGLE_TENSOR)->get_byte_size());
                 }
-
-                _logger.info("Tensor is not allocated in the current Level Zero context");
-                OV_ITT_TASK_NEXT(ZERO_INFER, "memcpy");
-                std::memcpy(levelZeroBuffer, userBuffer, userTensor.at(SINGLE_TENSOR)->get_byte_size());
             }
         }

@@ -665,15 +727,16 @@ void ZeroInferRequest::get_result() {
         const std::shared_ptr<ov::ITensor>& levelZeroTensor = _levelZeroOutputTensors.at(outputIndex);
         if (!is_remote_tensor(levelZeroTensor)) {
             void* levelZeroBuffer = levelZeroTensor->data();
+            if (userBuffer == nullptr || levelZeroBuffer == nullptr) {
+                OPENVINO_THROW("Empty buffer");
+            }

-            if (userBuffer != levelZeroBuffer) {
-                if (userBuffer == nullptr || levelZeroBuffer == nullptr) {
-                    OPENVINO_THROW("Empty buffer");
+            if (!_externalMemoryStandardAllocationSupported || !memory_aligned_to_standard_page_size(userBuffer)) {
+                if (userBuffer != levelZeroBuffer) {
+                    _logger.info("Tensor is not allocated in the current Level Zero context");
+                    OV_ITT_TASK_NEXT(ZERO_RESULT, "memcpy");
+                    std::memcpy(userBuffer, levelZeroBuffer, userTensor->get_byte_size());
                 }
-
-                _logger.info("Tensor is not allocated in the current Level Zero context");
-                OV_ITT_TASK_NEXT(ZERO_RESULT, "memcpy");
-                std::memcpy(userBuffer, levelZeroBuffer, userTensor->get_byte_size());
             }
         }
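The HostMemSharedAllocator::allocate() implementation in the next file rounds the tensor's byte size up to a whole number of 4 KB pages before importing it. A worked example of that rounding, using a hypothetical round_up_to_page helper for illustration only:

#include <cstddef>

constexpr std::size_t round_up_to_page(std::size_t bytes, std::size_t page = 4096) {
    // Same expression as in HostMemSharedAllocator::allocate():
    // (size + _alignment - 1) & ~(_alignment - 1)
    return (bytes + page - 1) & ~(page - 1);
}

static_assert(round_up_to_page(1) == 4096, "one byte still occupies a full page");
static_assert(round_up_to_page(4096) == 4096, "exact multiples are unchanged");
static_assert(round_up_to_page(4097) == 8192, "one byte over rolls to the next page");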
diff --git a/src/plugins/intel_npu/src/backend/src/zero_memory.cpp b/src/plugins/intel_npu/src/backend/src/zero_memory.cpp
index ef37774aa6995d..ca25ed9df25632 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_memory.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_memory.cpp
@@ -4,6 +4,8 @@

 #include "zero_memory.hpp"

+#include
+
 #include "intel_npu/utils/zero/zero_api.hpp"
 #include "intel_npu/utils/zero/zero_result.hpp"
 #include "intel_npu/utils/zero/zero_utils.hpp"
@@ -18,7 +20,7 @@ void* HostMemAllocator::allocate(const size_t bytes, const size_t /*alignment*/) {
                                      nullptr,
                                      static_cast<ze_host_mem_alloc_flags_t>(_flag)};
     void* data = nullptr;
-    ze_result_t result = zeMemAllocHost(_initStructs->getContext(), &desc, size, _alignment, &data);
+    auto result = zeMemAllocHost(_initStructs->getContext(), &desc, size, _alignment, &data);

     if (result == ZE_RESULT_SUCCESS) {
         return data;
@@ -46,5 +48,31 @@ bool HostMemAllocator::is_equal(const HostMemAllocator& other) const {
     return (_initStructs == other._initStructs) && (_flag == other._flag);
 }

+void* HostMemSharedAllocator::allocate(const size_t /*bytes*/, const size_t /*alignment*/) noexcept {
+    size_t size = (_tensor->get_byte_size() + _alignment - 1) & ~(_alignment - 1);
+
+    _ze_external_memory_import_system_memory_t memory_import = {ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_SYSTEM_MEMORY,
+                                                                 nullptr,
+                                                                 _tensor->data(),
+                                                                 size};
+
+    void* data = nullptr;
+
+    ze_host_mem_alloc_desc_t desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+                                     &memory_import,
+                                     static_cast<ze_host_mem_alloc_flags_t>(_flag)};
+    auto result = zeMemAllocHost(_initStructs->getContext(), &desc, size, _alignment, &data);
+
+    if (result == ZE_RESULT_SUCCESS) {
+        return data;
+    } else {
+        _logger.error("L0 zeMemAllocHost result: %s, code %#X - %s",
+                      ze_result_to_string(result).c_str(),
+                      uint64_t(result),
+                      ze_result_to_description(result).c_str());
+        return nullptr;
+    }
+}
+
 }  // namespace zeroMemory
 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp
index 0cf272fd7b3764..e242eed4d398c4 100644
--- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp
+++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp
@@ -37,6 +37,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest,
                                             ::testing::ValuesIn(configsInferRequestRunTests)),
                          InferRequestRunTests::getTestCaseName);

+INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest,
+                         CpuVaTensorsTests,
+                         ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU),
+                                            ::testing::ValuesIn(configsInferRequestRunTests)),
+                         InferRequestRunTests::getTestCaseName);
+
 const std::vector<ov::AnyMap> batchingConfigs = {
     {ov::log::level(ov::log::Level::WARNING), ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::PLUGIN)},
     {ov::log::level(ov::log::Level::WARNING), ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::COMPILER)},
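The new CpuVaTensorsTests below exercise the import path by handing the infer request buffers allocated with the aligned operator new. Condensed into a helper for reference (wrap_page_aligned_buffer is illustrative only; as in the tests, ownership of the raw buffer stays with the caller):

#include <cstddef>
#include <new>
#include <openvino/runtime/tensor.hpp>

ov::Tensor wrap_page_aligned_buffer(const ov::Shape& shape) {
    const std::size_t bytes = ov::shape_size(shape) * sizeof(float);
    // 4 KB-aligned allocation; ov::Tensor wraps the pointer without copying.
    auto* data = static_cast<float*>(::operator new(bytes, std::align_val_t(4096)));
    return ov::Tensor(ov::element::f32, shape, data);
    // The caller must later release `data` with
    // ::operator delete(data, std::align_val_t(4096)).
}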
diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp
index 7cbb5f297de3aa..0b50bc0769c18c 100644
--- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp
+++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp
@@ -9,7 +9,9 @@
 #include
 #include
+#include
 #include
+#include
 #include
 #include
@@ -1234,6 +1236,220 @@ TEST_P(SetShapeInferRunTests, checkResultsAfterStateTensorsReallocation) {
     }
 }

+using CpuVaTensorsTests = InferRequestRunTests;
+
+TEST_P(CpuVaTensorsTests, SetMultiplePageAllignedTensors) {
+    auto shape = Shape{1, 16, 16, 16};
+    auto shape_size = ov::shape_size(shape);
+    auto model = createModel(element::f32, shape, "N...");
+
+    compiled_model = core->compile_model(model, target_device, configuration);
+
+    const int inferences = 32;
+    ov::InferRequest inference_request;
+    ov::Tensor input_tensor;
+    std::array<ov::Tensor, inferences> output_tensor;
+
+    float* input_data;
+    float* output_data[inferences];
+
+    const auto input_byte_size = shape_size * sizeof(float);
+    input_data = static_cast<float*>(::operator new(input_byte_size, std::align_val_t(4096)));
+    for (size_t i = 0; i < shape_size; ++i) {
+        input_data[i] = 0.f;
+    }
+    input_tensor = ov::Tensor{ov::element::f32, shape, input_data};
+
+    inference_request = compiled_model.create_infer_request();
+
+    for (int i = 0; i < inferences; i++) {
+        auto tensor = inference_request.get_output_tensor(0);
+        const auto byte_size = tensor.get_byte_size();
+
+        output_data[i] = static_cast<float*>(::operator new(byte_size, std::align_val_t(4096)));
+        output_tensor[i] = ov::Tensor{ov::element::f32, tensor.get_shape(), output_data[i]};
+    }
+
+    inference_request.set_input_tensor(input_tensor);
+    inference_request.set_output_tensor(output_tensor[0]);
+
+    inference_request.infer();  // Adds '1' to each element
+
+    for (int i = 1; i < inferences; i++) {
+        inference_request.set_output_tensor(output_tensor[i]);
+        inference_request.set_input_tensor(output_tensor[i - 1]);
+
+        inference_request.infer();  // Adds '1' to each element
+    }
+
+    float expected_result = 1.f;
+
+    for (int i = 0; i < inferences; i++) {
+        auto* output_tensor_data = reinterpret_cast<float*>(output_tensor[i].data());
+        EXPECT_EQ(output_tensor_data, output_data[i]);
+        for (size_t j = 0; j < shape_size; ++j) {
+            EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5)
+                << "Output=" << i << " Expected=" << expected_result << ", actual=" << output_tensor_data[j]
+                << " for index " << j;
+        }
+        expected_result++;
+    }
+
+    ::operator delete(input_data, std::align_val_t(4096));
+    for (int i = 0; i < inferences; i++) {
+        ::operator delete(output_data[i], std::align_val_t(4096));
+    }
+}
+
+TEST_P(CpuVaTensorsTests, SetMultipleAllignedAndNotAllignedTensors) {
+    auto shape = Shape{1, 16, 16, 16};
+    auto shape_size = ov::shape_size(shape);
+    auto model = createModel(element::f32, shape, "N...");
+
+    compiled_model = core->compile_model(model, target_device, configuration);
+
+    const int inferences = 32;
+    ov::InferRequest inference_request;
+    ov::Tensor input_tensor;
+    std::array<ov::Tensor, inferences> output_tensor;
+
+    float* input_data;
+    float* output_data[inferences];
+
+    const auto input_byte_size = shape_size * sizeof(float);
+    input_data = static_cast<float*>(::operator new(input_byte_size, std::align_val_t(4096)));
+    input_tensor = ov::Tensor{ov::element::f32, shape, input_data};
+
+    inference_request = compiled_model.create_infer_request();
+    for (int i = 0; i < inferences; i++) {
+        auto tensor = inference_request.get_output_tensor(0);
+        const auto byte_size = tensor.get_byte_size();
+
+        if (i % 2 == 0) {
+            output_data[i] = static_cast<float*>(::operator new(byte_size, std::align_val_t(16)));
+        } else {
+            output_data[i] = static_cast<float*>(::operator new(byte_size, std::align_val_t(4096)));
+        }
+        output_tensor[i] = ov::Tensor{ov::element::f32, tensor.get_shape(), output_data[i]};
+    }
+
+    inference_request.set_input_tensor(input_tensor);
+    inference_request.set_output_tensor(output_tensor[0]);
+
+    for (size_t i = 0; i < shape_size; ++i) {
+        input_data[i] = 0.f;
+    }
+
+    inference_request.infer();  // Adds '1' to each element
+
+    for (int i = 1; i < inferences; i++) {
+        inference_request.set_output_tensor(output_tensor[i]);
+        inference_request.set_input_tensor(output_tensor[i - 1]);
+
+        inference_request.infer();  // Adds '1' to each element
+    }
+
+    float expected_result = 1.f;
+
+    for (int i = 0; i < inferences; i++) {
+        auto* output_tensor_data = reinterpret_cast<float*>(output_tensor[i].data());
+        for (size_t j = 0; j < shape_size; ++j) {
+            EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5)
+                << "Output=" << i << " Expected=" << expected_result << ", actual=" << output_tensor_data[j]
+                << " for index " << j;
+        }
+        expected_result++;
+    }
+
+    ::operator delete(input_data, std::align_val_t(4096));
+    for (int i = 0; i < inferences; i++) {
+        if (i % 2 == 0) {
+            ::operator delete(output_data[i], std::align_val_t(16));
+        } else {
+            ::operator delete(output_data[i], std::align_val_t(4096));
+        }
+    }
+}
+
+TEST_P(CpuVaTensorsTests, SetMultipleRemoteAllignedAndNotAllignedTensors) {
+    auto shape = Shape{1, 16, 16, 16};
+    auto shape_size = ov::shape_size(shape);
+    auto model = createModel(element::f32, shape, "N...");
+
+    auto context = core->get_default_context(target_device);
+    compiled_model = core->compile_model(model, target_device, configuration);
+
+    const int inferences = 32;
+    ov::InferRequest inference_request;
+    ov::Tensor input_tensor;
+    std::array<ov::Tensor, inferences> output_tensor;
+
+    float* input_data;
+    float* output_data[inferences];
+
+    const auto input_byte_size = shape_size * sizeof(float);
+    input_data = static_cast<float*>(::operator new(input_byte_size, std::align_val_t(16)));
+    input_tensor = ov::Tensor{ov::element::f32, shape, input_data};
+
+    inference_request = compiled_model.create_infer_request();
+    for (int i = 0; i < inferences; i++) {
+        auto tensor = inference_request.get_output_tensor(0);
+        const auto byte_size = tensor.get_byte_size();
+
+        if (i % 4 == 0) {
+            output_data[i] = static_cast<float*>(::operator new(byte_size, std::align_val_t(16)));
+            output_tensor[i] = ov::Tensor{ov::element::f32, tensor.get_shape(), output_data[i]};
+        } else if (i % 4 == 1) {
+            output_data[i] = static_cast<float*>(::operator new(byte_size, std::align_val_t(4096)));
+            output_tensor[i] = ov::Tensor{ov::element::f32, tensor.get_shape(), output_data[i]};
+        } else if (i % 4 == 2) {
+            output_tensor[i] = context.create_host_tensor(ov::element::f32, shape);
+        } else if (i % 4 == 3) {
+            output_data[i] = static_cast<float*>(::operator new(byte_size, std::align_val_t(4096)));
+            output_tensor[i] = ov::Tensor{ov::element::f32, tensor.get_shape(), output_data[i]};
+        }
+    }
+
+    for (size_t i = 0; i < shape_size; ++i) {
+        input_data[i] = 0.f;
+    }
+
+    inference_request.set_input_tensor(input_tensor);
+    inference_request.set_output_tensor(output_tensor[0]);
+
+    inference_request.infer();  // Adds '1' to each element
+
+    for (int i = 1; i < inferences; i++) {
+        inference_request.set_output_tensor(output_tensor[i]);
+        inference_request.set_input_tensor(output_tensor[i - 1]);
+
+        inference_request.infer();  // Adds '1' to each element
+    }
+
+    float expected_result = 1.f;
+
+    for (int i = 0; i < inferences; i++) {
+        auto* output_tensor_data = reinterpret_cast<float*>(output_tensor[i].data());
+        for (size_t j = 0; j < shape_size; ++j) {
+            EXPECT_NEAR(output_tensor_data[j], expected_result, 1e-5)
+                << "Output=" << i << " Expected=" << expected_result << ", actual=" << output_tensor_data[j]
+                << " for index " << j;
+        }
+        expected_result++;
+    }
+
+    ::operator delete(input_data, std::align_val_t(16));
+    for (int i = 0; i < inferences; i++) {
+        if (i % 4 == 0) {
+            ::operator delete(output_data[i], std::align_val_t(16));
+        } else if (i % 4 == 1) {
+            ::operator delete(output_data[i], std::align_val_t(4096));
+        } else if (i % 4 == 3) {
+            ::operator delete(output_data[i], std::align_val_t(4096));
+        }
+    }
+}
+
 }  // namespace behavior
 }  // namespace test
 }  // namespace ov
diff --git a/src/plugins/intel_npu/thirdparty/level-zero-ext b/src/plugins/intel_npu/thirdparty/level-zero-ext
index fd679a50f13884..8cf113bd4a4568 160000
--- a/src/plugins/intel_npu/thirdparty/level-zero-ext
+++ b/src/plugins/intel_npu/thirdparty/level-zero-ext
@@ -1 +1 @@
-Subproject commit fd679a50f138840633fee8c780dbbbaee971179c
+Subproject commit 8cf113bd4a4568f6555d81f316504d7ac3b82ee8
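Taken together, the change lets a plain host buffer be consumed without a copy when it is 4 KB aligned and the driver reports ZE_EXTERNAL_MEMORY_TYPE_FLAG_STANDARD_ALLOCATION; otherwise the existing memcpy path is kept. A user-level sketch, assuming a model with a single f32 input and a hypothetical model path:

#include <algorithm>
#include <cstddef>
#include <new>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto compiled = core.compile_model("model.xml", "NPU");  // hypothetical model path
    auto request = compiled.create_infer_request();

    const ov::Shape shape = compiled.input().get_shape();
    const std::size_t bytes = ov::shape_size(shape) * sizeof(float);

    // 4 KB-aligned buffer: eligible for import when the driver supports
    // standard-allocation external memory, so infer() can skip the input memcpy.
    auto* data = static_cast<float*>(::operator new(bytes, std::align_val_t(4096)));
    std::fill_n(data, ov::shape_size(shape), 0.0f);

    request.set_input_tensor(ov::Tensor(ov::element::f32, shape, data));
    request.infer();

    ::operator delete(data, std::align_val_t(4096));
    return 0;
}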