Skip to content

Increase resources for SPS core components #359

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 19, 2025
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ crash.*.log
# to change depending on the environment.
*.tfvars
*.tfvars.json
**/*.tfvars

# Ignore override files as they are usually used to override resources locally and so
# are not checked in
Expand Down
4 changes: 2 additions & 2 deletions airflow/dags/cwl_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@
is_paused_upon_creation=False,
catchup=False,
schedule=None,
max_active_runs=100,
max_active_tasks=300,
max_active_runs=1000,
max_active_tasks=3000,
default_args=dag_default_args,
params={
"cwl_workflow": Param(
Expand Down
66 changes: 60 additions & 6 deletions airflow/helm/values.tmpl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,11 @@ scheduler:
values: ["on-demand"]
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
values: ["c6i", "c5"] # Choosing compute-optimized instances
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: ["r5"] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: ["2", "4"] # Scheduler might benefit from higher CPU
values: ["8"]
topologySpreadConstraints:
- maxSkew: 1
topologyKey: "topology.kubernetes.io/zone"
Expand All @@ -117,13 +118,47 @@ triggerer:
keda:
enabled: true
minReplicaCount: 1
nodeSelector:
"karpenter.sh/nodepool": "airflow-core-components"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "karpenter.sh/capacity-type"
operator: "In"
values: [ "on-demand" ]
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: [ "r5" ] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: [ "8" ] # Triggerer pinned to 8-vCPU instances

postgresql:
enabled: false

pgbouncer:
enabled: true
replicas: 3
nodeSelector:
"karpenter.sh/nodepool": "airflow-core-components"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "karpenter.sh/capacity-type"
operator: "In"
values: [ "on-demand" ]
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: [ "r5" ] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: [ "8" ] # PgBouncer pinned to 8-vCPU instances

webserverSecretKeySecretName: ${webserver_secret_name}

Expand All @@ -147,10 +182,11 @@ webserver:
values: ["on-demand"]
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
values: ["c6i", "c5"] # Choosing compute-optimized instances
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: ["r5"] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: ["2", "4"] # Balancing between CPU and memory
values: ["8"] # Balancing between CPU and memory
topologySpreadConstraints:
- maxSkew: 1
topologyKey: "topology.kubernetes.io/zone"
Expand Down Expand Up @@ -184,10 +220,11 @@ workers:
- matchExpressions:
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
values: ["t3"]
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: ["r5"] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: ["2", "4"]
values: ["8"]
topologySpreadConstraints:
- maxSkew: 1
topologyKey: "topology.kubernetes.io/zone"
Expand Down Expand Up @@ -263,6 +300,23 @@ dags:
dagProcessor:
enabled: true
replicas: 3
nodeSelector:
"karpenter.sh/nodepool": "airflow-core-components"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "karpenter.sh/capacity-type"
operator: "In"
values: [ "on-demand" ]
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: [ "r5" ] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: [ "8" ] # DAG processor pinned to 8-vCPU instances

env:
- name: "AIRFLOW_VAR_KUBERNETES_PIPELINE_NAMESPACE"
Expand Down
36 changes: 23 additions & 13 deletions airflow/plugins/unity_sps_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

# Note: each Pod is assigned the same label to assure that (via the anti-affinity requirements)
# two Pods with the same label cannot run on the same Node
SPS_DOCKER_CWL_IMAGE = "ghcr.io/unity-sds/unity-sps/sps-docker-cwl:2.5.5"
SPS_DOCKER_CWL_IMAGE = "ghcr.io/unity-sds/unity-sps/sps-docker-cwl:2.5.6"

NODE_POOL_DEFAULT = "airflow-kubernetes-pod-operator"
NODE_POOL_HIGH_WORKLOAD = "airflow-kubernetes-pod-operator-high-workload"
Expand All @@ -27,11 +27,16 @@
LOG_LEVEL_TYPE = {10: "DEBUG", 20: "INFO", 30: "WARNING", 40: "ERROR", 50: "CRITICAL"}

EC2_TYPES = {
"t3.micro": {
"desc": "General Purpose",
"cpu": 1,
"memory": 1,
},
# "t3.nano": {
# "desc": "General Purpose",
# "cpu": 1,
# "memory": 0.5,
# },
# "t3.micro": {
# "desc": "General Purpose",
# "cpu": 2,
# "memory": 1,
# },
"t3.small": {
"desc": "General Purpose",
"cpu": 2,
Expand Down Expand Up @@ -97,23 +102,28 @@
"cpu": 32,
"memory": 64,
},
"m5ad.large": {
"desc": "General Purpose with SSD storage",
"cpu": 2,
"memory": 8,
"c6i.12xlarge": {
"desc": "Compute Optimized",
"cpu": 48,
"memory": 96,
},
"c6i.16xlarge": {
"desc": "Compute Optimized",
"cpu": 64,
"memory": 128,
},
"m5ad.xlarge": {
"desc": "General Purpose with SSD storage",
"desc": "General Purpose with SSD local storage",
"cpu": 4,
"memory": 16,
},
"m5ad.2xlarge": {
"desc": "General Purpose with SSD storage",
"desc": "General Purpose with SSD local storage",
"cpu": 8,
"memory": 32,
},
"m5ad.4xlarge": {
"desc": "General Purpose with SSD storage",
"desc": "General Purpose with SSD local storage",
"cpu": 16,
"memory": 64,
},
Expand Down
19 changes: 0 additions & 19 deletions terraform-unity/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions terraform-unity/README.md

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions terraform-unity/modules/terraform-unity-sps-eks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,37 @@
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | ~> 1.8.2 |
| <a name="requirement_aws"></a> [aws](#requirement\_aws) | 5.67.0 |
| <a name="requirement_null"></a> [null](#requirement\_null) | 3.2.3 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_aws"></a> [aws](#provider\_aws) | 5.67.0 |
| <a name="provider_null"></a> [null](#provider\_null) | 3.2.3 |

## Modules

| Name | Source | Version |
|------|--------|---------|
| <a name="module_unity-eks"></a> [unity-eks](#module\_unity-eks) | git::https://github.com/unity-sds/unity-cs-infra.git//terraform-unity-eks_module | unity-sps-2.4.0 |
| <a name="module_unity-eks"></a> [unity-eks](#module\_unity-eks) | git::https://github.com/unity-sds/unity-cs-infra.git//terraform-unity-eks_module | unity-sps-2.4.1-hotfix1 |

## Resources

| Name | Type |
|------|------|
| [aws_iam_role_policy.sps_airflow_eks_inline_policy](https://registry.terraform.io/providers/hashicorp/aws/5.67.0/docs/resources/iam_role_policy) | resource |
| [null_resource.eks_post_deployment_actions](https://registry.terraform.io/providers/hashicorp/null/3.2.3/docs/resources/resource) | resource |
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/5.67.0/docs/data-sources/caller_identity) | data source |
| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/5.67.0/docs/data-sources/region) | data source |

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_deployment_name"></a> [deployment\_name](#input\_deployment\_name) | The name of the deployment. | `string` | `""` | no |
| <a name="input_installprefix"></a> [installprefix](#input\_installprefix) | The install prefix for the service area (unused) | `string` | `""` | no |
| <a name="input_nodegroups"></a> [nodegroups](#input\_nodegroups) | A map of node group configurations | <pre>map(object({<br> create_iam_role = optional(bool)<br> iam_role_arn = optional(string)<br> ami_id = optional(string)<br> min_size = optional(number)<br> max_size = optional(number)<br> desired_size = optional(number)<br> instance_types = optional(list(string))<br> capacity_type = optional(string)<br> enable_bootstrap_user_data = optional(bool)<br> metadata_options = optional(map(any))<br> block_device_mappings = optional(map(object({<br> device_name = string<br> ebs = object({<br> volume_size = number<br> volume_type = string<br> encrypted = bool<br> delete_on_termination = bool<br> })<br> })))<br> }))</pre> | <pre>{<br> "defaultGroup": {<br> "block_device_mappings": {<br> "xvda": {<br> "device_name": "/dev/xvda",<br> "ebs": {<br> "delete_on_termination": true,<br> "encrypted": true,<br> "volume_size": 100,<br> "volume_type": "gp2"<br> }<br> }<br> },<br> "desired_size": 1,<br> "instance_types": [<br> "t3.xlarge"<br> ],<br> "max_size": 1,<br> "metadata_options": {<br> "http_endpoint": "enabled",<br> "http_put_response_hop_limit": 3<br> },<br> "min_size": 1<br> }<br>}</pre> | no |
| <a name="input_nodegroups"></a> [nodegroups](#input\_nodegroups) | A map of node group configurations | <pre>map(object({<br> create_iam_role = optional(bool)<br> iam_role_arn = optional(string)<br> ami_id = optional(string)<br> min_size = optional(number)<br> max_size = optional(number)<br> desired_size = optional(number)<br> instance_types = optional(list(string))<br> capacity_type = optional(string)<br> enable_bootstrap_user_data = optional(bool)<br> metadata_options = optional(map(any))<br> block_device_mappings = optional(map(object({<br> device_name = string<br> ebs = object({<br> volume_size = number<br> volume_type = string<br> encrypted = bool<br> delete_on_termination = bool<br> })<br> })))<br> }))</pre> | <pre>{<br> "defaultGroup": {<br> "block_device_mappings": {<br> "xvda": {<br> "device_name": "/dev/xvda",<br> "ebs": {<br> "delete_on_termination": true,<br> "encrypted": true,<br> "volume_size": 100,<br> "volume_type": "gp2"<br> }<br> }<br> },<br> "desired_size": 1,<br> "instance_types": [<br> "t3.2xlarge"<br> ],<br> "max_size": 1,<br> "metadata_options": {<br> "http_endpoint": "enabled",<br> "http_put_response_hop_limit": 3<br> },<br> "min_size": 1<br> }<br>}</pre> | no |
| <a name="input_project"></a> [project](#input\_project) | The project or mission deploying Unity SPS | `string` | `"unity"` | no |
| <a name="input_release"></a> [release](#input\_release) | The software release version. | `string` | `"24.4"` | no |
| <a name="input_service_area"></a> [service\_area](#input\_service\_area) | The service area owner of the resources being deployed | `string` | `"sps"` | no |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ variable "nodegroups" {
}))
default = {
defaultGroup = {
instance_types = ["t3.xlarge"]
instance_types = ["t3.2xlarge"]
min_size = 1
max_size = 1
desired_size = 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ resource "kubernetes_deployment" "ogc_processes_api" {
match_expressions {
key = "karpenter.k8s.aws/instance-cpu"
operator = "In"
values = ["2", "4"]
values = ["4"]
}
}
}
Expand Down
Loading