From 351ae354cbcab0d7aa01c84a97b284f39a077308 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 25 Feb 2025 15:20:09 -0500 Subject: [PATCH] upgrade tests: add CSI workload Add an upgrade test workload for CSI with the AWS EBS plugin. In order to validate this workload, we'll need to deploy the plugin jobs and then register volumes with them. So this extends the `run_workloads` module to allow for a "post script" to be run after a given job has been deployed. We can use that as a model for other test workloads. Ref: https://hashicorp.atlassian.net/browse/NET-12217 --- enos/enos-scenario-upgrade.hcl | 27 +++++-- .../jobs/plugin-aws-ebs-controller.nomad.hcl | 45 +++++++++++ .../jobs/plugin-aws-ebs-nodes.nomad.hcl | 48 ++++++++++++ .../jobs/raw-exec-system.nomad.hcl | 3 +- .../run_workloads/jobs/wants-volume.nomad.hcl | 75 +++++++++++++++++++ enos/modules/run_workloads/main.tf | 24 +++++- .../run_workloads/scripts/volume.hcl.tpl | 13 ++++ .../scripts/wait_for_ebs_plugin.sh | 66 ++++++++++++++++ enos/modules/run_workloads/variables.tf | 6 ++ 9 files changed, 300 insertions(+), 7 deletions(-) create mode 100644 enos/modules/run_workloads/jobs/plugin-aws-ebs-controller.nomad.hcl create mode 100644 enos/modules/run_workloads/jobs/plugin-aws-ebs-nodes.nomad.hcl create mode 100644 enos/modules/run_workloads/jobs/wants-volume.nomad.hcl create mode 100644 enos/modules/run_workloads/scripts/volume.hcl.tpl create mode 100755 enos/modules/run_workloads/scripts/wait_for_ebs_plugin.sh diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl index 1cc91aa77d7..4da29d42fa9 100644 --- a/enos/enos-scenario-upgrade.hcl +++ b/enos/enos-scenario-upgrade.hcl @@ -89,18 +89,35 @@ scenario "upgrade" { module = module.run_workloads variables { - nomad_addr = step.provision_cluster.nomad_addr - ca_file = step.provision_cluster.ca_file - cert_file = step.provision_cluster.cert_file - key_file = step.provision_cluster.key_file - nomad_token = 
step.provision_cluster.nomad_token + nomad_addr = step.provision_cluster.nomad_addr + ca_file = step.provision_cluster.ca_file + cert_file = step.provision_cluster.cert_file + key_file = step.provision_cluster.key_file + nomad_token = step.provision_cluster.nomad_token + cluster_name = local.cluster_name + workloads = { + # deploy these first: the node plugin's post_script waits for healthy plugins and registers the volume that wants_csi mounts (NOTE(review): confirm the module actually enforces this ordering) + csi_plugin_ebs_controller = { + job_spec = "jobs/plugin-aws-ebs-controller.nomad.hcl" + alloc_count = 2 + type = "service" + } + csi_plugin_ebs_node = { + job_spec = "jobs/plugin-aws-ebs-nodes.nomad.hcl" + alloc_count = 0 + type = "system" + post_script = "scripts/wait_for_ebs_plugin.sh" + } + service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3, type = "service" } service_docker = { job_spec = "jobs/docker-service.nomad.hcl", alloc_count = 3, type = "service" } system_docker = { job_spec = "jobs/docker-system.nomad.hcl", alloc_count = 0, type = "system" } batch_docker = { job_spec = "jobs/docker-batch.nomad.hcl", alloc_count = 3, type = "batch" } batch_raw_exec = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" } system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" } + wants_csi = { job_spec = "jobs/wants-volume.nomad.hcl", alloc_count = 1, type = "service" } + } } diff --git a/enos/modules/run_workloads/jobs/plugin-aws-ebs-controller.nomad.hcl b/enos/modules/run_workloads/jobs/plugin-aws-ebs-controller.nomad.hcl new file mode 100644 index 00000000000..0e7bbbf2bb0 --- /dev/null +++ b/enos/modules/run_workloads/jobs/plugin-aws-ebs-controller.nomad.hcl @@ -0,0 +1,45 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 +variable "alloc_count" { + type = number + default = 2 +} + +job "plugin-aws-ebs-controller" { + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "controller" { + + count = var.alloc_count + + task "plugin" { + driver = "docker" + + config { + image = "public.ecr.aws/ebs-csi-driver/aws-ebs-csi-driver:v1.33.0" + + args = [ + "controller", + "--endpoint=${CSI_ENDPOINT}", + "--logtostderr", + "--v=5", + ] + } + + csi_plugin { + id = "aws-ebs0" + type = "controller" + mount_dir = "/csi" + } + + resources { + cpu = 100 + memory = 256 + } + } + } +} diff --git a/enos/modules/run_workloads/jobs/plugin-aws-ebs-nodes.nomad.hcl b/enos/modules/run_workloads/jobs/plugin-aws-ebs-nodes.nomad.hcl new file mode 100644 index 00000000000..0bb20e1c628 --- /dev/null +++ b/enos/modules/run_workloads/jobs/plugin-aws-ebs-nodes.nomad.hcl @@ -0,0 +1,48 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +# alloc_count is not used by this system job, but the run_workloads module passes `-var alloc_count` to every job it runs, so it must be declared +variable "alloc_count" { + type = number + default = 1 +} + +job "plugin-aws-ebs-nodes" { + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + type = "system" + + group "nodes" { + task "plugin" { + driver = "docker" + + config { + image = "public.ecr.aws/ebs-csi-driver/aws-ebs-csi-driver:v1.33.0" + + args = [ + "node", + "--endpoint=${CSI_ENDPOINT}", + "--logtostderr", + "--v=5", + ] + + privileged = true + } + + csi_plugin { + id = "aws-ebs0" + type = "node" + mount_dir = "/csi" + } + + resources { + cpu = 100 + memory = 256 + } + } + } +} diff --git a/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl b/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl index 6636d51522d..731b4c2062a 100644 --- a/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl +++ b/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl @@ -1,6 +1,7 @@ # Copyright (c) HashiCorp, Inc. 
# SPDX-License-Identifier: BUSL-1.1 +# alloc_count is not used by this system job but must be declared because the runner always passes it variable "alloc_count" { type = number default = 1 @@ -24,7 +25,7 @@ job "system-raw-exec" { #!/bin/bash while true; do - sleep 30000 + sleep 30000 done EOH destination = "local/runme.sh" diff --git a/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl b/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl new file mode 100644 index 00000000000..75fe9da7cf9 --- /dev/null +++ b/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl @@ -0,0 +1,75 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +variable "alloc_count" { + type = number + default = 1 +} + +# a job that mounts an EBS volume and writes its allocation ID to a file on that volume +job "wants-ebs-volume" { + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "group" { + count = var.alloc_count + + volume "test" { + type = "csi" + source = "ebsTestVolume" + attachment_mode = "file-system" + access_mode = "single-node-writer" + } + + task "task" { + driver = "docker" + + config { + image = "busybox:1" + command = "httpd" + args = ["-vv", "-f", "-p", "8001", "-h", "/local"] + } + + volume_mount { + volume = "test" + destination = "${NOMAD_TASK_DIR}/test" + read_only = false + } + + resources { + cpu = 100 + memory = 64 + } + } + + task "sidecar" { + driver = "docker" + + config { + image = "busybox:1" + command = "/bin/sh" + args = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/test/index.html"] # write under the volume mount so the file lands on the EBS volume, which the httpd task also mounts + } + + lifecycle { + hook = "poststart" + sidecar = false + } + + volume_mount { + volume = "test" + destination = "${NOMAD_TASK_DIR}/test" + read_only = false + } + + resources { + cpu = 10 + memory = 10 + } + + } + } +} diff --git a/enos/modules/run_workloads/main.tf b/enos/modules/run_workloads/main.tf index e3e3fec73b5..d47bee16b07 100644 --- a/enos/modules/run_workloads/main.tf +++ b/enos/modules/run_workloads/main.tf @@ -16,12 +16,31 @@ locals { NOMAD_CLIENT_CERT = var.cert_file 
NOMAD_CLIENT_KEY = var.key_file NOMAD_TOKEN = var.nomad_token + CLUSTER_NAME = var.cluster_name } system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" }) service_batch_allocs = sum([for wl in var.workloads : wl.alloc_count]) } +# look up the server0 instance so the test volume can be created in its availability zone +data "aws_instance" "server0" { + filter { + name = "tag:Name" + values = ["${var.cluster_name}-server0"] + } +} + +# test volume we'll register for the CSI workload; tagged with the cluster name so the post script can find it +resource "aws_ebs_volume" "test_volume" { + availability_zone = data.aws_instance.server0.availability_zone + size = 10 + + tags = { + Cluster = var.cluster_name + } +} + resource "enos_local_exec" "wait_for_nomad_api" { environment = local.nomad_env @@ -52,5 +71,8 @@ resource "enos_local_exec" "workloads" { environment = local.nomad_env - inline = ["nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}"] + inline = [ + "nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}", + each.value.post_script != null ? abspath("${path.module}/${each.value.post_script}") : "echo ok" + ] } diff --git a/enos/modules/run_workloads/scripts/volume.hcl.tpl b/enos/modules/run_workloads/scripts/volume.hcl.tpl new file mode 100644 index 00000000000..99ee051fb71 --- /dev/null +++ b/enos/modules/run_workloads/scripts/volume.hcl.tpl @@ -0,0 +1,13 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 + +type = "csi" +id = "ebsTestVolume" +name = "IDEMPOTENCY_TOKEN" +external_id = "EXTERNAL_ID" +plugin_id = "aws-ebs0" + +capability { + access_mode = "single-node-writer" + attachment_mode = "file-system" +} diff --git a/enos/modules/run_workloads/scripts/wait_for_ebs_plugin.sh b/enos/modules/run_workloads/scripts/wait_for_ebs_plugin.sh new file mode 100755 index 00000000000..193a1327623 --- /dev/null +++ b/enos/modules/run_workloads/scripts/wait_for_ebs_plugin.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +set -euo pipefail + +# note: it can take a very long time for plugins to come up +TIMEOUT=60 +INTERVAL=2 +last_error= +start_time=$(date +%s) + +checkPlugin() { # returns 0 once the plugin reports 2 healthy controllers and at least 1 healthy node; otherwise sets last_error and returns 1 + local pluginStatus foundControllers foundNodes + pluginStatus=$(nomad plugin status aws-ebs0) || { + last_error="could not read CSI plugin status" + return 1 + } + + foundControllers=$(echo "$pluginStatus" | awk -F'= +' '/Controllers Healthy/{print $2}') + if [[ "$foundControllers" != 2 ]]; then + echo "$foundControllers" + last_error="expected plugin to have 2 healthy controllers, found $foundControllers" + return 1 + fi + + foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}') + if [[ -z "$foundNodes" || "$foundNodes" == 0 ]]; then + last_error="expected plugin to have at least 1 healthy nodes, found none" + return 1 + fi + return 0 +} + +registerVolume() { # finds the EBS volume tagged with $CLUSTER_NAME and registers it with Nomad using volume.hcl.tpl + local externalID idempotencyToken + idempotencyToken=$(uuidgen) + + externalID=$(aws ec2 describe-volumes \ + --filters "Name=tag:Cluster,Values=$CLUSTER_NAME" \ + --output=json | jq -er '.Volumes[0].VolumeId') || { # jq -e fails when no volume matches; -r strips the JSON quotes + echo "Could not find volume for $CLUSTER_NAME" + exit 1 + } + + sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \ + -e "s/EXTERNAL_ID/$externalID/" \ + "$(dirname "${BASH_SOURCE[0]}")/volume.hcl.tpl" | nomad volume register - +} + +while : +do + checkPlugin && break + + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [ "$elapsed_time" -ge 
"$TIMEOUT" ]; then + echo "Error: CSI plugin did not become available within $TIMEOUT seconds: $last_error" + exit 1 + fi + + sleep "$INTERVAL" +done + +registerVolume +nomad volume status -type csi diff --git a/enos/modules/run_workloads/variables.tf b/enos/modules/run_workloads/variables.tf index 6bd96875ec3..f1bf1cf4ecb 100644 --- a/enos/modules/run_workloads/variables.tf +++ b/enos/modules/run_workloads/variables.tf @@ -35,5 +35,11 @@ variable "workloads" { job_spec = string alloc_count = number type = string + post_script = optional(string) })) } + +variable "cluster_name" { + description = "The name of the cluster, which we need to find the AZ" + type = string +}