#include "core/providers/openvino/backends/basic_backend.h"
#include "core/providers/openvino/onnx_ctx_model_helper.h"
#include "core/providers/openvino/backend_manager.h"
+ #include "core/providers/openvino/ov_stateful_patch_utils.h"

namespace onnxruntime {
@@ -29,6 +30,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
ptr_stream_t& model_stream)
    : session_context_{session_context}, subgraph_context_{subgraph_context}, shared_context_{shared_context} {
std::string& hw_target = session_context_.device_type;
+ bool enable_causallm = session_context_.enable_causallm;

if (ValidateSubgraph(const_outputs_map_))
  return;
@@ -43,7 +45,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
// Setting OpenCL queue throttling for GPU
EnableGPUThrottling(device_config);

- // Enable streams; default=1 unless ovverriden by user config
+ // Enable streams; default=1 unless overridden by user configuration
EnableStreams();

// Set the inference_num_threads property of the CPU
@@ -95,7 +97,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
} else if (!session_context_.has_external_weights &&
           !subgraph_context_.has_dynamic_input_shape &&
           !session_context_.so_context_enable &&
-           auto_unified_compile) {
+           !enable_causallm && auto_unified_compile) {
// Unified OV compile_model is efficient when ov model caching is enabled
// Unified OV compile_model API is supported with AUTO from version 2024.3 and above
// Inputs with static dimensions
@@ -115,7 +117,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
}
auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
exe_network_ = OVCore::Get()->CompileModel(
-     ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
+     ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name);
}
#endif
LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
@@ -200,6 +202,15 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
if (!session_context_.load_config.empty()) {
const std::map<std::string, ov::AnyMap>& target_config = session_context_.load_config;

+ if ((session_context_.device_type.find("NPU") != std::string::npos) && session_context_.enable_causallm) {
+   if (target_config.find("NPU") != target_config.end()) {
+     auto npu_genai_config = target_config.at("NPU");
+     CausalLMConfig().ApplyConfig(npu_genai_config, device_config);
+   } else {
+     LOGS_DEFAULT(WARNING) << "ORT GenAI CausalLMConfig Configuration not found.";
+   }
+ }
+
if (session_context_.device_type.find("NPU") != std::string::npos) {
auto npuw_config = target_config.at("NPU");
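The NPU GenAI branch above hands the user-supplied "NPU" section of load_config to CausalLMConfig::ApplyConfig, which populates device_config before the generic NPUW handling runs. The helper below is only an illustration of that kind of merge, assuming user-provided entries take precedence; the real key set and precedence rules live in ov_stateful_patch_utils.h and are not shown in this diff.

// Illustrative only: fold a user-supplied ov::AnyMap section into the device
// configuration, letting user-provided entries overwrite existing defaults.
// This approximates what a helper like CausalLMConfig::ApplyConfig might do;
// the actual keys and rules are defined in ov_stateful_patch_utils.
#include <openvino/openvino.hpp>

static void MergeGenAiConfig(const ov::AnyMap& user_npu_config, ov::AnyMap& device_config) {
  for (const auto& [key, value] : user_npu_config) {
    device_config[key] = value;  // overwrite or insert
  }
}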
@@ -265,7 +276,8 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options,
                                 const std::vector<ov::PropertyName>& supported_properties) {
for (const auto& [key, value] : config_options) {
- if (key.find("NPUW") != std::string::npos) {
+ if ((key.find("NPUW") != std::string::npos) ||
+     ((device_config.find(key) != device_config.end()) && session_context_.enable_causallm)) {
  continue;
}
if (is_supported_and_mutable(key, supported_properties)) {
@@ -358,6 +370,13 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) {
device_config.emplace(ov::inference_num_threads(session_context_.num_of_threads));
}

+ void BasicBackend::RewindKVCache(size_t index) {
+   OVInferRequestPtr infer_request;
+   infer_request = inferRequestsQueue_->getIdleRequest();
+   infer_request->RewindKVCache(index);
+   inferRequestsQueue_->putIdleRequest(std::move(infer_request));
+ }
+
// Starts an asynchronous inference request for data in slice indexed by batch_slice_idx on
// an Infer Request indexed by infer_req_idx
void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
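The new BasicBackend::RewindKVCache borrows an idle request, rewinds its KV cache to `index` tokens, and returns the request to the pool; the per-request trimming itself lives in OVInferRequest::RewindKVCache, which this hunk does not show. The sketch below illustrates one way such trimming could work through OpenVINO's stateful API (ov::InferRequest::query_state); treating axis 2 as the sequence axis is an assumption for illustration, not taken from the actual implementation.

// Rough sketch (not the EP's OVInferRequest::RewindKVCache): trim every variable
// state of a stateful OpenVINO model down to the first `index` tokens.
// Assumes the sequence-length axis of each KV state tensor is axis 2.
#include <openvino/openvino.hpp>

void RewindStatefulRequest(ov::InferRequest& request, size_t index) {
  for (auto& state : request.query_state()) {
    ov::Tensor full = state.get_state();
    ov::Shape shape = full.get_shape();
    if (shape.size() < 3 || shape[2] <= index) continue;  // nothing to trim
    ov::Coordinate begin(shape.size(), 0);
    ov::Coordinate end(shape);
    end[2] = index;                        // keep only the first `index` positions
    ov::Shape new_shape = shape;
    new_shape[2] = index;
    ov::Tensor roi(full, begin, end);      // view over the retained slice
    ov::Tensor trimmed(full.get_element_type(), new_shape);
    roi.copy_to(trimmed);                  // densify before re-setting the state
    state.set_state(trimmed);
  }
}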
@@ -376,14 +395,22 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
}
index++;
}
+
+ // For Stateful Model Compilation, the ONNX model includes KV cache (past/present) tensors.
+ // However, these tensors are internally converted to a stateful representation, which removes them.
+ // To prevent runtime exceptions, we simply continue processing here.
+ if (input_name.empty() || input_name == "beam_idx") continue;
+
ORT_ENFORCE(!input_name.empty(), log_tag,
            "Input names mismatch between OpenVINO and ONNX. ", onnx_input_name,
            " doesn't exist in the list of OpenVINO input tensor names");
size_t batch_slice_idx = 0;
if (subgraph_context_.has_dynamic_input_shape &&
    !session_context_.disable_dynamic_shapes &&
    (session_context_.device_type.find("CPU") != std::string::npos ||
-     session_context_.device_type.find("GPU") != std::string::npos)) {
+     session_context_.device_type.find("GPU") != std::string::npos ||
+     (session_context_.device_type.find("NPU") != std::string::npos &&
+      session_context_.enable_causallm))) {
auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
auto tensor_info = tensor.GetTensorTypeAndShapeInfo();
auto tensor_shape = tensor_info.GetShape();
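The `continue` added above relies on the stateful transformation removing past/present KV tensors from the OpenVINO model's inputs, so the ONNX-to-OV input-name lookup can legitimately come up empty. A hedged sketch of that kind of lookup, with an illustrative helper name that is not part of the EP:

// Illustrative helper (not in the EP): return the OpenVINO input name matching an
// ONNX input, or an empty string when the stateful transformation has removed it
// (e.g. past/present KV tensors now handled as internal variable state).
#include <openvino/openvino.hpp>
#include <string>

static std::string MatchOvInputName(const ov::CompiledModel& compiled, const std::string& onnx_name) {
  for (const auto& input : compiled.inputs()) {
    if (input.get_names().count(onnx_name)) return input.get_any_name();
  }
  return {};  // caller skips this ONNX input, as the diff above does
}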
@@ -445,7 +472,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
}
} // Loop subgraph original input names

- if (session_context_.device_type.find("NPU") != std::string::npos) {
+ // For stateful compilation (i.e. when enable_causallm is true), the NPU plugin also uses the dynamic shapes path.
+ if (session_context_.device_type.find("NPU") != std::string::npos && !session_context_.enable_causallm) {
// Set the output blob as remote blob
auto graph_output_info = exe_network_.Get().outputs();
auto output_idx = 0;
@@ -640,7 +668,9 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
            "list of OpenVINO output tensor names");
}
if ((session_context_.device_type.find("CPU") != std::string::npos ||
-     session_context_.device_type.find("GPU") != std::string::npos)) {
+     session_context_.device_type.find("GPU") != std::string::npos ||
+     (session_context_.device_type.find("NPU") != std::string::npos &&
+      session_context_.enable_causallm))) {
try {
  graph_output_blob = infer_request->GetTensor(output_name);
} catch (const char* msg) {
@@ -719,25 +749,41 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
try {
  StartRemoteAsyncInference(context, infer_request);
} catch (std::string const& msg) {
+ // If the inference fails (exception from ov::InferRequest::infer()),
+ // we need to put the infer_request back into the pool to avoid deadlocks
+ // and to allow the next inference request to proceed.
+ inferRequestsQueue_->putIdleRequest(std::move(infer_request));
  ORT_THROW(msg);
}
} else {
try {
  StartAsyncInference(context, infer_request);
} catch (std::string const& msg) {
+ // If the inference fails (exception from ov::InferRequest::infer()),
+ // we need to put the infer_request back into the pool to avoid deadlocks
+ // and to allow the next inference request to proceed.
+ inferRequestsQueue_->putIdleRequest(std::move(infer_request));
  ORT_THROW(msg);
}
}
#else
try {
  StartAsyncInference(context, infer_request);
} catch (const std::runtime_error& e) {
+ // If the inference fails (exception from ov::InferRequest::infer()),
+ // we need to put the infer_request back into the pool to avoid deadlocks
+ // and to allow the next inference request to proceed.
+ inferRequestsQueue_->putIdleRequest(std::move(infer_request));
  ORT_THROW(log_tag + " Exception at StartAsyncInference: " + e.what());
}
#endif
try {
  CompleteAsyncInference(context, infer_request);
} catch (const std::runtime_error& e) {
+ // If the inference fails (exception from ov::InferRequest::infer()),
+ // we need to put the infer_request back into the pool to avoid deadlocks
+ // and to allow the next inference request to proceed.
+ inferRequestsQueue_->putIdleRequest(std::move(infer_request));
  ORT_THROW(log_tag + " Exception at CompleteAsyncInference: " + e.what());
}
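The same put-the-request-back recovery now appears in each catch block of Infer. A scope guard would centralize it; the sketch below is generic over the queue and request types, and the usage comment assumes the inferRequestsQueue_ member and request type already used in this file.

// Sketch of an RAII alternative to repeating putIdleRequest() in every catch block:
// the guard returns the borrowed request on all exit paths, including exceptions.
#include <utility>

template <typename Queue, typename RequestPtr>
struct IdleRequestGuard {
  Queue& queue;
  RequestPtr request;
  ~IdleRequestGuard() {
    if (request) queue.putIdleRequest(std::move(request));
  }
};

// Possible usage inside BasicBackend::Infer (illustrative, not the current code):
//   IdleRequestGuard<InferRequestsQueue, OVInferRequestPtr> guard{
//       *inferRequestsQueue_, inferRequestsQueue_->getIdleRequest()};
//   StartAsyncInference(context, guard.request);
//   CompleteAsyncInference(context, guard.request);
//   // no explicit putIdleRequest() needed; the guard handles success and failure alike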