From 6cd625c4ee5fcfe5ba8770da4c0452005bb0418b Mon Sep 17 00:00:00 2001 From: Andrews Arokiam Date: Wed, 11 Oct 2023 14:44:49 +0530 Subject: [PATCH 1/3] Added docs for raw deployment autoscaling. Signed-off-by: Andrews Arokiam --- docs/modelserving/autoscaling/autoscaling.md | 91 +++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/docs/modelserving/autoscaling/autoscaling.md b/docs/modelserving/autoscaling/autoscaling.md index 7071518bf..17a3d02fb 100644 --- a/docs/modelserving/autoscaling/autoscaling.md +++ b/docs/modelserving/autoscaling/autoscaling.md @@ -1,6 +1,8 @@ # Autoscale InferenceService with inference workload -## InferenceService with target concurrency +## Autoscaler for kserve's Serverless + +### InferenceService with target concurrency ### Create `InferenceService` @@ -497,4 +499,89 @@ This allows more flexibility in terms of the autoscaling configuration. In a typ - mnist ``` Apply the `autoscale-adv.yaml` to create the Autoscale InferenceService. -The default for scaleMetric is `concurrency` and possible values are `concurrency`, `rps`, `cpu` and `memory`. \ No newline at end of file +The default for scaleMetric is `concurrency` and possible values are `concurrency`, `rps`, `cpu` and `memory`. + +## Autoscaler for Kserve's Raw Deployment Mode + +KServe supports `RawDeployment` mode to enable `InferenceService` deployment with Kubernetes resources [`Deployment`](https://kubernetes.io/docs/concepts/workloads/controllers/deployment), [`Service`](https://kubernetes.io/docs/concepts/services-networking/service), [`Ingress`](https://kubernetes.io/docs/concepts/services-networking/ingress) and [`Horizontal Pod Autoscaler`](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale). Comparing to serverless deployment it unlocks Knative limitations such as mounting multiple volumes, on the other hand `Scale down and from Zero` is not supported in `RawDeployment` mode. 
+ +### HPA in Raw Deployment + +When using Kserve with the `RawDeployment` mode, Knative is not installed. In this mode, if you deploy an `InferenceService`, Kserve uses **Kubernetes’ Horizontal Pod Autoscaler (HPA)** for autoscaling instead of **Knative Pod Autoscaler (KPA)**. For more information about Kserve's autoscaler, you can refer [`this`](https://kserve.github.io/website/master/modelserving/v1beta1/torchserve/#knative-autoscaler) + + +=== "Old Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + name: "sklearn-iris-hpa" + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: hpa + serving.kserve.io/metric: cpu + serving.kserve.io/targetUtilizationPercentage: "80" + spec: + predictor: + sklearn: + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` + +=== "New Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + name: "sklearn-iris-hpa" + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: hpa + serving.kserve.io/metric: cpu + serving.kserve.io/targetUtilizationPercentage: "80" + spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` + +### Disable HPA in Raw Deployment + +If you want to control the scaling of the deployment created by KServe inference service with an external tool like [`KEDA`](https://keda.sh/). 
You can disable KServe's creation of the **HPA** by replacing **external** value with autoscaler class annotaion that should be disable the creation of HPA + +=== "Old Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: external + name: "sklearn-iris" + spec: + predictor: + sklearn: + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` + +=== "New Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: external + name: "sklearn-iris" + spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` From 257ad87f1c2330d12217e1255a20610766e32e95 Mon Sep 17 00:00:00 2001 From: Andrews Arokiam Date: Mon, 6 Nov 2023 14:33:32 +0530 Subject: [PATCH 2/3] Schema order changed. Signed-off-by: Andrews Arokiam --- docs/modelserving/autoscaling/autoscaling.md | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/modelserving/autoscaling/autoscaling.md b/docs/modelserving/autoscaling/autoscaling.md index 17a3d02fb..6b0fbf0ef 100644 --- a/docs/modelserving/autoscaling/autoscaling.md +++ b/docs/modelserving/autoscaling/autoscaling.md @@ -510,7 +510,7 @@ KServe supports `RawDeployment` mode to enable `InferenceService` deployment wit When using Kserve with the `RawDeployment` mode, Knative is not installed. In this mode, if you deploy an `InferenceService`, Kserve uses **Kubernetes’ Horizontal Pod Autoscaler (HPA)** for autoscaling instead of **Knative Pod Autoscaler (KPA)**. 
For more information about Kserve's autoscaler, you can refer [`this`](https://kserve.github.io/website/master/modelserving/v1beta1/torchserve/#knative-autoscaler) -=== "Old Schema" +=== "New Schema" ```yaml apiVersion: "serving.kserve.io/v1beta1" @@ -524,11 +524,13 @@ When using Kserve with the `RawDeployment` mode, Knative is not installed. In th serving.kserve.io/targetUtilizationPercentage: "80" spec: predictor: - sklearn: + model: + modelFormat: + name: sklearn storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" ``` -=== "New Schema" +=== "Old Schema" ```yaml apiVersion: "serving.kserve.io/v1beta1" @@ -542,9 +544,7 @@ When using Kserve with the `RawDeployment` mode, Knative is not installed. In th serving.kserve.io/targetUtilizationPercentage: "80" spec: predictor: - model: - modelFormat: - name: sklearn + sklearn: storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" ``` @@ -552,7 +552,7 @@ When using Kserve with the `RawDeployment` mode, Knative is not installed. In th If you want to control the scaling of the deployment created by KServe inference service with an external tool like [`KEDA`](https://keda.sh/). 
You can disable KServe's creation of the **HPA** by replacing **external** value with autoscaler class annotaion that should be disable the creation of HPA -=== "Old Schema" +=== "New Schema" ```yaml apiVersion: "serving.kserve.io/v1beta1" @@ -564,11 +564,13 @@ If you want to control the scaling of the deployment created by KServe inference name: "sklearn-iris" spec: predictor: - sklearn: + model: + modelFormat: + name: sklearn storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" ``` -=== "New Schema" +=== "Old Schema" ```yaml apiVersion: "serving.kserve.io/v1beta1" @@ -580,8 +582,6 @@ If you want to control the scaling of the deployment created by KServe inference name: "sklearn-iris" spec: predictor: - model: - modelFormat: - name: sklearn + sklearn: storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" ``` From 83ffb79948361982657114bf048aa14bc4f08c6c Mon Sep 17 00:00:00 2001 From: Andrews Arokiam Date: Mon, 20 Jan 2025 08:01:45 +0530 Subject: [PATCH 3/3] code review changes Signed-off-by: Andrews Arokiam --- docs/modelserving/autoscaling/autoscaling.md | 85 ------------------ .../raw_deployment_autoscalling.md | 89 +++++++++++++++++++ mkdocs.yml | 1 + 3 files changed, 90 insertions(+), 85 deletions(-) create mode 100644 docs/modelserving/autoscaling/raw_deployment_autoscalling.md diff --git a/docs/modelserving/autoscaling/autoscaling.md b/docs/modelserving/autoscaling/autoscaling.md index 6b0fbf0ef..6061e25b1 100644 --- a/docs/modelserving/autoscaling/autoscaling.md +++ b/docs/modelserving/autoscaling/autoscaling.md @@ -500,88 +500,3 @@ This allows more flexibility in terms of the autoscaling configuration. In a typ ``` Apply the `autoscale-adv.yaml` to create the Autoscale InferenceService. The default for scaleMetric is `concurrency` and possible values are `concurrency`, `rps`, `cpu` and `memory`. 
- -## Autoscaler for Kserve's Raw Deployment Mode - -KServe supports `RawDeployment` mode to enable `InferenceService` deployment with Kubernetes resources [`Deployment`](https://kubernetes.io/docs/concepts/workloads/controllers/deployment), [`Service`](https://kubernetes.io/docs/concepts/services-networking/service), [`Ingress`](https://kubernetes.io/docs/concepts/services-networking/ingress) and [`Horizontal Pod Autoscaler`](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale). Comparing to serverless deployment it unlocks Knative limitations such as mounting multiple volumes, on the other hand `Scale down and from Zero` is not supported in `RawDeployment` mode. - -### HPA in Raw Deployment - -When using Kserve with the `RawDeployment` mode, Knative is not installed. In this mode, if you deploy an `InferenceService`, Kserve uses **Kubernetes’ Horizontal Pod Autoscaler (HPA)** for autoscaling instead of **Knative Pod Autoscaler (KPA)**. For more information about Kserve's autoscaler, you can refer [`this`](https://kserve.github.io/website/master/modelserving/v1beta1/torchserve/#knative-autoscaler) - - -=== "New Schema" - - ```yaml - apiVersion: "serving.kserve.io/v1beta1" - kind: "InferenceService" - metadata: - name: "sklearn-iris-hpa" - annotations: - serving.kserve.io/deploymentMode: RawDeployment - serving.kserve.io/autoscalerClass: hpa - serving.kserve.io/metric: cpu - serving.kserve.io/targetUtilizationPercentage: "80" - spec: - predictor: - model: - modelFormat: - name: sklearn - storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" - ``` - -=== "Old Schema" - - ```yaml - apiVersion: "serving.kserve.io/v1beta1" - kind: "InferenceService" - metadata: - name: "sklearn-iris-hpa" - annotations: - serving.kserve.io/deploymentMode: RawDeployment - serving.kserve.io/autoscalerClass: hpa - serving.kserve.io/metric: cpu - serving.kserve.io/targetUtilizationPercentage: "80" - spec: - predictor: - sklearn: - storageUri: 
"gs://kfserving-examples/models/sklearn/1.0/model" - ``` - -### Disable HPA in Raw Deployment - -If you want to control the scaling of the deployment created by KServe inference service with an external tool like [`KEDA`](https://keda.sh/). You can disable KServe's creation of the **HPA** by replacing **external** value with autoscaler class annotaion that should be disable the creation of HPA - -=== "New Schema" - - ```yaml - apiVersion: "serving.kserve.io/v1beta1" - kind: "InferenceService" - metadata: - annotations: - serving.kserve.io/deploymentMode: RawDeployment - serving.kserve.io/autoscalerClass: external - name: "sklearn-iris" - spec: - predictor: - model: - modelFormat: - name: sklearn - storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" - ``` - -=== "Old Schema" - - ```yaml - apiVersion: "serving.kserve.io/v1beta1" - kind: "InferenceService" - metadata: - annotations: - serving.kserve.io/deploymentMode: RawDeployment - serving.kserve.io/autoscalerClass: external - name: "sklearn-iris" - spec: - predictor: - sklearn: - storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" - ``` diff --git a/docs/modelserving/autoscaling/raw_deployment_autoscalling.md b/docs/modelserving/autoscaling/raw_deployment_autoscalling.md new file mode 100644 index 000000000..19e250e64 --- /dev/null +++ b/docs/modelserving/autoscaling/raw_deployment_autoscalling.md @@ -0,0 +1,89 @@ +## Autoscaler for Kserve's Raw Deployment Mode + +KServe supports `RawDeployment` mode to enable `InferenceService` deployment with Kubernetes resources [`Deployment`](https://kubernetes.io/docs/concepts/workloads/controllers/deployment), [`Service`](https://kubernetes.io/docs/concepts/services-networking/service), [`Ingress`](https://kubernetes.io/docs/concepts/services-networking/ingress) and [`Horizontal Pod Autoscaler`](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale). 
Compared to serverless deployment, it removes some Knative limitations, such as the inability to mount multiple volumes; on the other hand, `Scale down to and from Zero` is not supported in `RawDeployment` mode.
you can refer [`this`](https://knative.dev/docs/serving/autoscaling/autoscaling-metrics). + + +### Disable HPA in Raw Deployment + +If you want to control the scaling of the deployment created by KServe inference service with an external tool like [`KEDA`](https://keda.sh/). You can disable KServe's creation of the **HPA** by replacing **external** value with autoscaler class annotaion that should be disable the creation of HPA + +=== "New Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: external + name: "sklearn-iris" + spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` + +=== "Old Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: external + name: "sklearn-iris" + spec: + predictor: + sklearn: + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` diff --git a/mkdocs.yml b/mkdocs.yml index 346b6c88d..ca6286152 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -94,6 +94,7 @@ nav: - Inference Logger: modelserving/logger/logger.md - Autoscaling: - Inference Autoscaling: modelserving/autoscaling/autoscaling.md + - Raw Deployment Autoscaling: modelserving/autoscaling/raw_deployment_autoscaling.md - Node Scheduling: - Overview: modelserving/nodescheduling/overview.md - InferenceService Node Scheduling: modelserving/nodescheduling/inferenceservicenodescheduling.md