From 6cd625c4ee5fcfe5ba8770da4c0452005bb0418b Mon Sep 17 00:00:00 2001 From: Andrews Arokiam Date: Wed, 11 Oct 2023 14:44:49 +0530 Subject: [PATCH 1/3] Added docs for raw deployment autoscaling. Signed-off-by: Andrews Arokiam --- docs/modelserving/autoscaling/autoscaling.md | 91 +++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/docs/modelserving/autoscaling/autoscaling.md b/docs/modelserving/autoscaling/autoscaling.md index 7071518bf..17a3d02fb 100644 --- a/docs/modelserving/autoscaling/autoscaling.md +++ b/docs/modelserving/autoscaling/autoscaling.md @@ -1,6 +1,8 @@ # Autoscale InferenceService with inference workload -## InferenceService with target concurrency +## Autoscaler for kserve's Serverless + +### InferenceService with target concurrency ### Create `InferenceService` @@ -497,4 +499,89 @@ This allows more flexibility in terms of the autoscaling configuration. In a typ - mnist ``` Apply the `autoscale-adv.yaml` to create the Autoscale InferenceService. -The default for scaleMetric is `concurrency` and possible values are `concurrency`, `rps`, `cpu` and `memory`. \ No newline at end of file +The default for scaleMetric is `concurrency` and possible values are `concurrency`, `rps`, `cpu` and `memory`. + +## Autoscaler for Kserve's Raw Deployment Mode + +KServe supports `RawDeployment` mode to enable `InferenceService` deployment with Kubernetes resources [`Deployment`](https://kubernetes.io/docs/concepts/workloads/controllers/deployment), [`Service`](https://kubernetes.io/docs/concepts/services-networking/service), [`Ingress`](https://kubernetes.io/docs/concepts/services-networking/ingress) and [`Horizontal Pod Autoscaler`](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale). Comparing to serverless deployment it unlocks Knative limitations such as mounting multiple volumes, on the other hand `Scale down and from Zero` is not supported in `RawDeployment` mode. 
+ +### HPA in Raw Deployment + +When using Kserve with the `RawDeployment` mode, Knative is not installed. In this mode, if you deploy an `InferenceService`, Kserve uses **Kubernetes’ Horizontal Pod Autoscaler (HPA)** for autoscaling instead of **Knative Pod Autoscaler (KPA)**. For more information about Kserve's autoscaler, you can refer [`this`](https://kserve.github.io/website/master/modelserving/v1beta1/torchserve/#knative-autoscaler) + + +=== "Old Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + name: "sklearn-iris-hpa" + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: hpa + serving.kserve.io/metric: cpu + serving.kserve.io/targetUtilizationPercentage: "80" + spec: + predictor: + sklearn: + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` + +=== "New Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + name: "sklearn-iris-hpa" + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: hpa + serving.kserve.io/metric: cpu + serving.kserve.io/targetUtilizationPercentage: "80" + spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` + +### Disable HPA in Raw Deployment + +If you want to control the scaling of the deployment created by KServe inference service with an external tool like [`KEDA`](https://keda.sh/). 
You can disable KServe's creation of the **HPA** by replacing **external** value with autoscaler class annotaion that should be disable the creation of HPA + +=== "Old Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: external + name: "sklearn-iris" + spec: + predictor: + sklearn: + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` + +=== "New Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: external + name: "sklearn-iris" + spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` From 257ad87f1c2330d12217e1255a20610766e32e95 Mon Sep 17 00:00:00 2001 From: Andrews Arokiam Date: Mon, 6 Nov 2023 14:33:32 +0530 Subject: [PATCH 2/3] Schema order changed. Signed-off-by: Andrews Arokiam --- docs/modelserving/autoscaling/autoscaling.md | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/modelserving/autoscaling/autoscaling.md b/docs/modelserving/autoscaling/autoscaling.md index 17a3d02fb..6b0fbf0ef 100644 --- a/docs/modelserving/autoscaling/autoscaling.md +++ b/docs/modelserving/autoscaling/autoscaling.md @@ -510,7 +510,7 @@ KServe supports `RawDeployment` mode to enable `InferenceService` deployment wit When using Kserve with the `RawDeployment` mode, Knative is not installed. In this mode, if you deploy an `InferenceService`, Kserve uses **Kubernetes’ Horizontal Pod Autoscaler (HPA)** for autoscaling instead of **Knative Pod Autoscaler (KPA)**. 
For more information about Kserve's autoscaler, you can refer [`this`](https://kserve.github.io/website/master/modelserving/v1beta1/torchserve/#knative-autoscaler) -=== "Old Schema" +=== "New Schema" ```yaml apiVersion: "serving.kserve.io/v1beta1" @@ -524,11 +524,13 @@ When using Kserve with the `RawDeployment` mode, Knative is not installed. In th serving.kserve.io/targetUtilizationPercentage: "80" spec: predictor: - sklearn: + model: + modelFormat: + name: sklearn storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" ``` -=== "New Schema" +=== "Old Schema" ```yaml apiVersion: "serving.kserve.io/v1beta1" @@ -542,9 +544,7 @@ When using Kserve with the `RawDeployment` mode, Knative is not installed. In th serving.kserve.io/targetUtilizationPercentage: "80" spec: predictor: - model: - modelFormat: - name: sklearn + sklearn: storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" ``` @@ -552,7 +552,7 @@ When using Kserve with the `RawDeployment` mode, Knative is not installed. In th If you want to control the scaling of the deployment created by KServe inference service with an external tool like [`KEDA`](https://keda.sh/). 
You can disable KServe's creation of the **HPA** by replacing **external** value with autoscaler class annotaion that should be disable the creation of HPA -=== "Old Schema" +=== "New Schema" ```yaml apiVersion: "serving.kserve.io/v1beta1" @@ -564,11 +564,13 @@ If you want to control the scaling of the deployment created by KServe inference name: "sklearn-iris" spec: predictor: - sklearn: + model: + modelFormat: + name: sklearn storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" ``` -=== "New Schema" +=== "Old Schema" ```yaml apiVersion: "serving.kserve.io/v1beta1" @@ -580,8 +582,6 @@ If you want to control the scaling of the deployment created by KServe inference name: "sklearn-iris" spec: predictor: - model: - modelFormat: - name: sklearn + sklearn: storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" ``` From 83ffb79948361982657114bf048aa14bc4f08c6c Mon Sep 17 00:00:00 2001 From: Andrews Arokiam Date: Mon, 20 Jan 2025 08:01:45 +0530 Subject: [PATCH 3/3] code review changes Signed-off-by: Andrews Arokiam --- docs/modelserving/autoscaling/autoscaling.md | 85 ------------------ .../raw_deployment_autoscalling.md | 89 +++++++++++++++++++ mkdocs.yml | 1 + 3 files changed, 90 insertions(+), 85 deletions(-) create mode 100644 docs/modelserving/autoscaling/raw_deployment_autoscalling.md diff --git a/docs/modelserving/autoscaling/autoscaling.md b/docs/modelserving/autoscaling/autoscaling.md index 6b0fbf0ef..6061e25b1 100644 --- a/docs/modelserving/autoscaling/autoscaling.md +++ b/docs/modelserving/autoscaling/autoscaling.md @@ -500,88 +500,3 @@ This allows more flexibility in terms of the autoscaling configuration. In a typ ``` Apply the `autoscale-adv.yaml` to create the Autoscale InferenceService. The default for scaleMetric is `concurrency` and possible values are `concurrency`, `rps`, `cpu` and `memory`. 
- -## Autoscaler for Kserve's Raw Deployment Mode - -KServe supports `RawDeployment` mode to enable `InferenceService` deployment with Kubernetes resources [`Deployment`](https://kubernetes.io/docs/concepts/workloads/controllers/deployment), [`Service`](https://kubernetes.io/docs/concepts/services-networking/service), [`Ingress`](https://kubernetes.io/docs/concepts/services-networking/ingress) and [`Horizontal Pod Autoscaler`](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale). Comparing to serverless deployment it unlocks Knative limitations such as mounting multiple volumes, on the other hand `Scale down and from Zero` is not supported in `RawDeployment` mode. - -### HPA in Raw Deployment - -When using Kserve with the `RawDeployment` mode, Knative is not installed. In this mode, if you deploy an `InferenceService`, Kserve uses **Kubernetes’ Horizontal Pod Autoscaler (HPA)** for autoscaling instead of **Knative Pod Autoscaler (KPA)**. For more information about Kserve's autoscaler, you can refer [`this`](https://kserve.github.io/website/master/modelserving/v1beta1/torchserve/#knative-autoscaler) - - -=== "New Schema" - - ```yaml - apiVersion: "serving.kserve.io/v1beta1" - kind: "InferenceService" - metadata: - name: "sklearn-iris-hpa" - annotations: - serving.kserve.io/deploymentMode: RawDeployment - serving.kserve.io/autoscalerClass: hpa - serving.kserve.io/metric: cpu - serving.kserve.io/targetUtilizationPercentage: "80" - spec: - predictor: - model: - modelFormat: - name: sklearn - storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" - ``` - -=== "Old Schema" - - ```yaml - apiVersion: "serving.kserve.io/v1beta1" - kind: "InferenceService" - metadata: - name: "sklearn-iris-hpa" - annotations: - serving.kserve.io/deploymentMode: RawDeployment - serving.kserve.io/autoscalerClass: hpa - serving.kserve.io/metric: cpu - serving.kserve.io/targetUtilizationPercentage: "80" - spec: - predictor: - sklearn: - storageUri: 
"gs://kfserving-examples/models/sklearn/1.0/model" - ``` - -### Disable HPA in Raw Deployment - -If you want to control the scaling of the deployment created by KServe inference service with an external tool like [`KEDA`](https://keda.sh/). You can disable KServe's creation of the **HPA** by replacing **external** value with autoscaler class annotaion that should be disable the creation of HPA - -=== "New Schema" - - ```yaml - apiVersion: "serving.kserve.io/v1beta1" - kind: "InferenceService" - metadata: - annotations: - serving.kserve.io/deploymentMode: RawDeployment - serving.kserve.io/autoscalerClass: external - name: "sklearn-iris" - spec: - predictor: - model: - modelFormat: - name: sklearn - storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" - ``` - -=== "Old Schema" - - ```yaml - apiVersion: "serving.kserve.io/v1beta1" - kind: "InferenceService" - metadata: - annotations: - serving.kserve.io/deploymentMode: RawDeployment - serving.kserve.io/autoscalerClass: external - name: "sklearn-iris" - spec: - predictor: - sklearn: - storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" - ``` diff --git a/docs/modelserving/autoscaling/raw_deployment_autoscalling.md b/docs/modelserving/autoscaling/raw_deployment_autoscalling.md new file mode 100644 index 000000000..19e250e64 --- /dev/null +++ b/docs/modelserving/autoscaling/raw_deployment_autoscalling.md @@ -0,0 +1,89 @@ +## Autoscaler for Kserve's Raw Deployment Mode + +KServe supports `RawDeployment` mode to enable `InferenceService` deployment with Kubernetes resources [`Deployment`](https://kubernetes.io/docs/concepts/workloads/controllers/deployment), [`Service`](https://kubernetes.io/docs/concepts/services-networking/service), [`Ingress`](https://kubernetes.io/docs/concepts/services-networking/ingress) and [`Horizontal Pod Autoscaler`](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale). 
Compared to serverless deployment, it removes some Knative limitations, such as the inability to mount multiple volumes; on the other hand, `Scale down to and from Zero` is not supported in `RawDeployment` mode.
you can refer [`this`](https://knative.dev/docs/serving/autoscaling/autoscaling-metrics). + + +### Disable HPA in Raw Deployment + +If you want to control the scaling of the deployment created by KServe inference service with an external tool like [`KEDA`](https://keda.sh/). You can disable KServe's creation of the **HPA** by replacing **external** value with autoscaler class annotaion that should be disable the creation of HPA + +=== "New Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: external + name: "sklearn-iris" + spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` + +=== "Old Schema" + + ```yaml + apiVersion: "serving.kserve.io/v1beta1" + kind: "InferenceService" + metadata: + annotations: + serving.kserve.io/deploymentMode: RawDeployment + serving.kserve.io/autoscalerClass: external + name: "sklearn-iris" + spec: + predictor: + sklearn: + storageUri: "gs://kfserving-examples/models/sklearn/1.0/model" + ``` diff --git a/mkdocs.yml b/mkdocs.yml index 346b6c88d..ca6286152 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -94,6 +94,7 @@ nav: - Inference Logger: modelserving/logger/logger.md - Autoscaling: - Inference Autoscaling: modelserving/autoscaling/autoscaling.md + - Raw Deployment Autoscaling: modelserving/autoscaling/raw_deployment_autoscaling.md - Node Scheduling: - Overview: modelserving/nodescheduling/overview.md - InferenceService Node Scheduling: modelserving/nodescheduling/inferenceservicenodescheduling.md