Optimize CPU time spent in inference path #682

Merged · 4 commits · May 26, 2025

4 changes: 2 additions & 2 deletions onnxruntime/core/providers/openvino/backend_utils.cc
@@ -121,15 +121,15 @@ std::istream& operator>>(std::istream& stream, SharedContext::SharedWeights::Met
namespace backend_utils {

bool IsDebugEnabled() {
const std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_DEBUG");

Review comment:

Suggestion: can we pull these into the context or subcontext instead of checking all over the place? It's fine if we don't do this change for now, just wondering.

Author reply:

Yeah, I don't see a reason why it can't be moved. Though I'll defer that for now since making it static effectively removes the environment var checks anyway.
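
For reference, a minimal standalone sketch of the pattern this hunk adopts (not the exact PR code): a function-local static is initialized exactly once, and thread-safely since C++11, so the environment lookup only happens on the first call. std::getenv stands in for onnxruntime::GetEnvironmentVar to keep the sketch self-contained, and caching the resulting bool rather than the string is a small variation on the diff.

#include <cstdlib>

// Sketch only: the PR itself caches the string returned by
// onnxruntime::GetEnvironmentVar; caching the derived bool is equivalent here.
bool IsDebugEnabled() {
  // The static local is initialized on the first call and reused afterwards,
  // so the environment is not queried on every invocation.
  static const bool enabled = [] {
    const char* value = std::getenv("ORT_OPENVINO_ENABLE_DEBUG");
    return value != nullptr && value[0] != '\0';
  }();
  return enabled;
}

The hunk below applies the same idea by making env_name a function-local static.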

static std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_DEBUG");
if (!env_name.empty()) {
return true;
}
return false;
}

bool IsCILogEnabled() {
const std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG");
static std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG");
if (!env_name.empty()) {
return true;
}
136 changes: 38 additions & 98 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -140,6 +140,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
};
}
inferRequestsQueue_ = std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(exe_network_, num_infer_req, std::move(initializer)));
bindings_ = std::make_unique<OnnxToOvNetworkBindings>(exe_network_, subgraph_context_);
}

bool BasicBackend::ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map) {
@@ -362,29 +363,16 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) {
// an Infer Request indexed by infer_req_idx
void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
try {
auto ov_input_info = exe_network_.Get().inputs();

// Loop over subgraph original input names to find the correspondent OV input name
for (const auto& [onnx_input_name, onnx_input_index] : subgraph_context_.input_names) {
std::string input_name{};
uint32_t input_idx = 0;
for (uint32_t index = 0; const auto& ov_input : ov_input_info) {
if (ov_input.get_names().contains(onnx_input_name)) {
input_name = onnx_input_name;
input_idx = index;
break;
}
index++;
}
ORT_ENFORCE(!input_name.empty(), log_tag,
"Input names mismatch between OpenVINO and ONNX. ", onnx_input_name,
" doesn't exist in the list of OpenVINO input tensor names");
bool cpu_or_gpu = (session_context_.device_type.find("CPU") != std::string::npos ||

Review comment:

Since I'm spreading my musings here, I might as well mention that I'd like to avoid doing these string compares all over the place and instead have a test for the selected devices that can be checked easily and quickly.

Author reply:

Absolutely. I was tempted to do that as well, but thinking about the meta devices (auto, multi, hetero, etc.) complicated things just enough that I didn't want to go down that rabbit hole (yet). 😄
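
To make the idea concrete, here is a hypothetical sketch of such a device test, computed once per backend rather than via repeated std::string::find calls on every inference; the DeviceFlags name and its placement are assumptions, not part of this PR, and as noted above the meta devices (auto, multi, hetero) would still need dedicated handling.

#include <string>

// Hypothetical helper, not part of this PR: evaluate the device-type string
// once and expose cheap boolean queries afterwards.
struct DeviceFlags {
  bool cpu;
  bool gpu;
  bool npu;

  explicit DeviceFlags(const std::string& device_type)
      : cpu(device_type.find("CPU") != std::string::npos),
        gpu(device_type.find("GPU") != std::string::npos),
        npu(device_type.find("NPU") != std::string::npos) {}

  bool cpu_or_gpu() const { return cpu || gpu; }
};

Usage would be along the lines of constructing DeviceFlags flags{session_context_.device_type}; once in the backend constructor and then replacing string searches such as the cpu_or_gpu check in the hunk below.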

session_context_.device_type.find("GPU") != std::string::npos);
bool npu = (session_context_.device_type.find("NPU") != std::string::npos);

for (const auto& input_info : bindings_->network_inputs_) {
size_t batch_slice_idx = 0;
if (subgraph_context_.has_dynamic_input_shape &&
!session_context_.disable_dynamic_shapes &&
(session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos)) {
auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
cpu_or_gpu) {
auto tensor = context.GetInput(input_info.onnx_index);
auto tensor_info = tensor.GetTensorTypeAndShapeInfo();
auto tensor_shape = tensor_info.GetShape();
auto tensor_size = tensor_shape.size();
@@ -395,98 +383,72 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
input_tensor_shape[tensor_iter] = *i;
tensor_iter += 1;
}
const auto& input = ov_input_info.at(input_idx);
OVTensorPtr tensor_ptr;
// avoid input copies on the CPU device
if (session_context_.device_type.find("CPU") != std::string::npos) {
tensor_ptr = std::make_shared<ov::Tensor>(input.get_element_type(), input_tensor_shape,
tensor_ptr = std::make_shared<ov::Tensor>(input_info.type, input_tensor_shape,
(void*)tensor_data);
} else {
tensor_ptr = std::make_shared<ov::Tensor>(input.get_element_type(), input_tensor_shape);
FillInputBlob(tensor_ptr, batch_slice_idx, input_name, context, subgraph_context_);
tensor_ptr = std::make_shared<ov::Tensor>(input_info.type, input_tensor_shape);
FillInputBlob(tensor_ptr, batch_slice_idx, input_info.name, context, subgraph_context_);
}

try {
infer_request->SetTensor(std::move(input_name), tensor_ptr);
infer_request->SetTensor(input_info.name, tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
} else {
if ((session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos)) {
if (cpu_or_gpu) {
OVTensorPtr graph_input_blob;
try {
graph_input_blob = infer_request->GetTensor(input_name);
graph_input_blob = infer_request->GetTensor(input_info.name);
} catch (const char* msg) {
ORT_THROW(msg);
}
FillInputBlob(std::move(graph_input_blob), batch_slice_idx, std::move(input_name), context, subgraph_context_);
FillInputBlob(std::move(graph_input_blob), batch_slice_idx, input_info.name, context, subgraph_context_);
} else {
auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
ort_tensor_key_t ort_tensor_key{input_name};
auto tensor = context.GetInput(input_info.onnx_index);
ort_tensor_key_t ort_tensor_key{input_info.name};
auto it = ort_ov_tensor_map.find(ort_tensor_key);
if ((it == ort_ov_tensor_map.end()) ||
(it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) {
if ((it == ort_ov_tensor_map.end()) || it->second.ort_ptr != tensor.GetTensorRawData()) {
ov_tensor_data_t ov_tensor_data;
const auto& input = ov_input_info.at(input_idx);
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(input.get_element_type(), input.get_shape(),
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(input_info.type, input_info.ov_shape.get_shape(),
const_cast<void*>(tensor.GetTensorRawData()));

ov_tensor_data.ort_ptr = tensor.GetTensorRawData();
ort_ov_tensor_map[ort_tensor_key] = ov_tensor_data;

try {
infer_request->SetTensor(std::move(input_name), ov_tensor_data.tensor_ptr);
infer_request->SetTensor(input_info.name, ov_tensor_data.tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
}
}
}
} // Loop subgraph original input names
} // Loop subgraph original input

if (session_context_.device_type.find("NPU") != std::string::npos) {
if (npu) {
// Set the output blob as remote blob
auto graph_output_info = exe_network_.Get().outputs();
auto output_idx = 0;
for (auto output_info_iter = graph_output_info.begin();
output_info_iter != graph_output_info.end(); ++output_info_iter) {
auto output_names = output_info_iter->get_names();
std::string onnx_output_name;
std::string output_name;
// using the output name retrieved from ONNX original to match with the output names returned by OV tensors
for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
onnx_output_name = it->first;
if (output_names.find(onnx_output_name) != output_names.end()) {
// Assigning the output_name
output_name = it->first;
break;
}
}
size_t batch_size = 1;
Ort::UnownedValue tensor = GetOutputTensor(context,
batch_size,
infer_request,
output_name,
subgraph_context_.output_names);
ort_tensor_key_t ort_tensor_key{output_name};
for (const auto& output_info : bindings_->network_outputs_) {
Ort::UnownedValue tensor = context.GetOutput(output_info.onnx_index, output_info.onnx_shape);

ort_tensor_key_t ort_tensor_key{output_info.name};
const auto& it = ort_ov_tensor_map.find(ort_tensor_key);
if ((it == ort_ov_tensor_map.end()) ||
(it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) {
if ((it == ort_ov_tensor_map.end()) || (it->second.ort_ptr != tensor.GetTensorRawData())) {
ov_tensor_data_t ov_tensor_data;
const auto& output = graph_output_info.at(output_idx);
ov_tensor_data.ort_ptr = tensor.GetTensorRawData();
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(output.get_element_type(), output.get_shape(),
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(output_info.type, output_info.ov_shape.get_shape(),
const_cast<void*>(tensor.GetTensorRawData()));
ort_ov_tensor_map[ort_tensor_key] = ov_tensor_data;

try {
infer_request->SetTensor(std::move(output_name), ov_tensor_data.tensor_ptr);
infer_request->SetTensor(output_info.name, ov_tensor_data.tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
}
output_idx++;
}
}

@@ -611,44 +573,22 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe
void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
// Wait for Async inference completion
try {
bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos;

infer_request->WaitRequest();
auto graph_output_info = exe_network_.Get().outputs();
for (auto output_info_iter = graph_output_info.begin();
output_info_iter != graph_output_info.end(); ++output_info_iter) {
OVTensorPtr graph_output_blob;
auto output_names = output_info_iter->get_names();
std::string onnx_output_name;
std::string output_name;
bool output_name_found = false;
// using the output name retrieved from ONNX original to match with the output names returned by OV tensors
for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
onnx_output_name = it->first;
if (output_names.find(onnx_output_name) != output_names.end()) {
// Assigning the output_name
output_name = it->first;
output_name_found = true;
break;
}
}
if (!output_name_found) {
ORT_THROW(
log_tag +
"Output names mismatch between OpenVINO and ONNX. "
"[ONNX Output: ] " +
onnx_output_name +
" doesn't exist in the "
"list of OpenVINO output tensor names");
}
if ((session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos)) {

if (cpu_or_gpu) {
for (const auto& output_info : bindings_->network_outputs_) {
OVTensorPtr graph_output_blob;
try {
graph_output_blob = infer_request->GetTensor(output_name);
graph_output_blob = infer_request->GetTensor(output_info.name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
return;
46 changes: 45 additions & 1 deletion onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -13,11 +13,14 @@
#include <mutex>
#include <map>
#include <functional>
#include <algorithm>
#include <utility>

#include "core/session/onnxruntime_cxx_api.h"
#include "core/providers/openvino/contexts.h"
#include "core/providers/openvino/ibackend.h"
#include "core/providers/openvino/ov_interface.h"
#include "core/providers/openvino/backend_utils.h"

namespace onnxruntime {
namespace openvino_ep {
@@ -27,6 +30,47 @@ struct ov_tensor_data_t {
const void* ort_ptr;
};

struct OnnxToOvNetworkBindings {
struct ParameterInfo {
std::string name;
uint32_t ov_index;
uint32_t onnx_index;
ov::element::Type type;
ov::PartialShape ov_shape;
std::vector<int64_t> onnx_shape;
};
std::vector<ParameterInfo> network_outputs_;
std::vector<ParameterInfo> network_inputs_;

OnnxToOvNetworkBindings(OVExeNetwork& exec_network, SubGraphContext& subgraph_context) {
auto populate = [&](auto& input_output_map, const SubGraphContext::string_index_map_t& onnx_input_map, const auto& ov_parameters) {
for (const auto& [onnx_name, onnx_param_index] : onnx_input_map) {
auto it = std::find_if(ov_parameters.begin(), ov_parameters.end(),
[&onnx_name](const auto& ov_parameter_info) { return ov_parameter_info.get_names().contains(onnx_name); });

ORT_ENFORCE(it != ov_parameters.end(), backend_utils::log_tag,
"Input names mismatch between OpenVINO and ONNX. ", onnx_name,
" doesn't exist in the list of OpenVINO input tensor names");

auto ov_param_index = std::distance(ov_parameters.begin(), it);

auto shape = ov_parameters[ov_param_index].get_partial_shape();
auto type = ov_parameters[ov_param_index].get_element_type();
ParameterInfo info{onnx_name, ov_param_index, onnx_param_index, type, shape};

if (shape.is_static()) {
auto static_shape = shape.get_shape();
std::transform(static_shape.begin(), static_shape.end(), std::back_inserter(info.onnx_shape), [](const auto& dim) { return static_cast<int64_t>(dim); });
}
input_output_map.push_back(std::move(info));
}
};

populate(network_inputs_, subgraph_context.input_names, exec_network.Get().inputs());
populate(network_outputs_, subgraph_context.output_names, exec_network.Get().outputs());
}
};

class InferRequestsQueue;
class BasicBackend : public IBackend {
public:
Expand All @@ -43,7 +87,6 @@ class BasicBackend : public IBackend {
}

private:
void PopulateCompiledDirectory(std::string, std::string&, std::string&, bool&);
bool ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map);
void PopulateConfigValue(ov::AnyMap& device_config);
void EnableCaching();
@@ -71,6 +114,7 @@

using ort_tensor_key_t = const std::string;
std::map<ort_tensor_key_t, ov_tensor_data_t> ort_ov_tensor_map;
std::unique_ptr<OnnxToOvNetworkBindings> bindings_;
};

class InferRequestsQueue {