From 484d0fe77abf2cf8cf3521cb7647bd1c101dd6d8 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 25 Feb 2025 15:20:09 -0500 Subject: [PATCH] upgrade tests: add CSI workload Add an upgrade test workload for CSI with the AWS EBS plugin. In order to validate this workload, we'll need to deploy the plugin jobs and then register volumes with them. So this extends the `run_workloads` module to allow for a "post script" to be run after a given job has been deployed. We can use that as a model for other test workloads. Ref: https://hashicorp.atlassian.net/browse/NET-12217 --- enos/enos-scenario-upgrade.hcl | 28 +++++-- enos/enos-vars.hcl | 5 ++ .../jobs/plugin-aws-ebs-controller.nomad.hcl | 45 +++++++++++ .../jobs/plugin-aws-ebs-nodes.nomad.hcl | 48 ++++++++++++ .../jobs/raw-exec-system.nomad.hcl | 3 +- .../run_workloads/jobs/wants-volume.nomad.hcl | 75 +++++++++++++++++++ enos/modules/run_workloads/main.tf | 30 +++++++- .../run_workloads/scripts/volume.hcl.tpl | 13 ++++ .../scripts/wait_for_ebs_plugin.sh | 68 +++++++++++++++++ enos/modules/run_workloads/variables.tf | 6 ++ 10 files changed, 312 insertions(+), 9 deletions(-) create mode 100644 enos/modules/run_workloads/jobs/plugin-aws-ebs-controller.nomad.hcl create mode 100644 enos/modules/run_workloads/jobs/plugin-aws-ebs-nodes.nomad.hcl create mode 100644 enos/modules/run_workloads/jobs/wants-volume.nomad.hcl create mode 100644 enos/modules/run_workloads/scripts/volume.hcl.tpl create mode 100755 enos/modules/run_workloads/scripts/wait_for_ebs_plugin.sh diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl index 1cc91aa77d7..6dfd37aad9b 100644 --- a/enos/enos-scenario-upgrade.hcl +++ b/enos/enos-scenario-upgrade.hcl @@ -76,6 +76,7 @@ scenario "upgrade" { consul_license = var.consul_license volumes = false region = var.aws_region + availability_zone = var.aws_az instance_arch = matrix.arch } } @@ -89,18 +90,35 @@ scenario "upgrade" { module = module.run_workloads variables { - nomad_addr = 
step.provision_cluster.nomad_addr - ca_file = step.provision_cluster.ca_file - cert_file = step.provision_cluster.cert_file - key_file = step.provision_cluster.key_file - nomad_token = step.provision_cluster.nomad_token + nomad_addr = step.provision_cluster.nomad_addr + ca_file = step.provision_cluster.ca_file + cert_file = step.provision_cluster.cert_file + key_file = step.provision_cluster.key_file + nomad_token = step.provision_cluster.nomad_token + availability_zone = var.aws_az + workloads = { + # deploy these first + csi_plugin_ebs_controller = { + job_spec = "jobs/plugin-aws-ebs-controller.nomad.hcl" + alloc_count = 2 + type = "service" + } + csi_plugin_ebs_node = { + job_spec = "jobs/plugin-aws-ebs-nodes.nomad.hcl" + alloc_count = 0 + type = "system" + post_script = "scripts/wait_for_ebs_plugin.sh" + } + service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3, type = "service" } service_docker = { job_spec = "jobs/docker-service.nomad.hcl", alloc_count = 3, type = "service" } system_docker = { job_spec = "jobs/docker-system.nomad.hcl", alloc_count = 0, type = "system" } batch_docker = { job_spec = "jobs/docker-batch.nomad.hcl", alloc_count = 3, type = "batch" } batch_raw_exec = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" } system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" } + wants_csi = { job_spec = "jobs/wants-volume.nomad.hcl", alloc_count = 1, type = "service" } + } } diff --git a/enos/enos-vars.hcl b/enos/enos-vars.hcl index 18058bb83a6..120478df094 100644 --- a/enos/enos-vars.hcl +++ b/enos/enos-vars.hcl @@ -54,3 +54,8 @@ variable "aws_region" { description = "The AWS region to deploy to." default = "us-east-1" } + +variable "aws_az" { + description = "The AWS availability zone to deploy to." 
+ default = "us-east-1b" +} diff --git a/enos/modules/run_workloads/jobs/plugin-aws-ebs-controller.nomad.hcl b/enos/modules/run_workloads/jobs/plugin-aws-ebs-controller.nomad.hcl new file mode 100644 index 00000000000..0e7bbbf2bb0 --- /dev/null +++ b/enos/modules/run_workloads/jobs/plugin-aws-ebs-controller.nomad.hcl @@ -0,0 +1,45 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 +variable "alloc_count" { + type = number + default = 2 +} + +job "plugin-aws-ebs-controller" { + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "controller" { + + count = var.alloc_count + + task "plugin" { + driver = "docker" + + config { + image = "public.ecr.aws/ebs-csi-driver/aws-ebs-csi-driver:v1.33.0" + + args = [ + "controller", + "--endpoint=${CSI_ENDPOINT}", + "--logtostderr", + "--v=5", + ] + } + + csi_plugin { + id = "aws-ebs0" + type = "controller" + mount_dir = "/csi" + } + + resources { + cpu = 100 + memory = 256 + } + } + } +} diff --git a/enos/modules/run_workloads/jobs/plugin-aws-ebs-nodes.nomad.hcl b/enos/modules/run_workloads/jobs/plugin-aws-ebs-nodes.nomad.hcl new file mode 100644 index 00000000000..0bb20e1c628 --- /dev/null +++ b/enos/modules/run_workloads/jobs/plugin-aws-ebs-nodes.nomad.hcl @@ -0,0 +1,48 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 + +# this variable is not used but required by runner +variable "alloc_count" { + type = number + default = 1 +} + +job "plugin-aws-ebs-nodes" { + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + type = "system" + + group "nodes" { + task "plugin" { + driver = "docker" + + config { + image = "public.ecr.aws/ebs-csi-driver/aws-ebs-csi-driver:v1.33.0" + + args = [ + "node", + "--endpoint=${CSI_ENDPOINT}", + "--logtostderr", + "--v=5", + ] + + privileged = true + } + + csi_plugin { + id = "aws-ebs0" + type = "node" + mount_dir = "/csi" + } + + resources { + cpu = 100 + memory = 256 + } + } + } +} diff --git a/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl b/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl index 6636d51522d..731b4c2062a 100644 --- a/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl +++ b/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl @@ -1,6 +1,7 @@ # Copyright (c) HashiCorp, Inc. # SPDX-License-Identifier: BUSL-1.1 +# this variable is not used but required by runner variable "alloc_count" { type = number default = 1 @@ -24,7 +25,7 @@ job "system-raw-exec" { #!/bin/bash while true; do - sleep 30000 + sleep 30000 done EOH destination = "local/runme.sh" diff --git a/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl b/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl new file mode 100644 index 00000000000..75fe9da7cf9 --- /dev/null +++ b/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl @@ -0,0 +1,75 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 + +variable "alloc_count" { + type = number + default = 1 +} + +# a job that mounts an EBS volume and writes its allocation ID to a file +job "wants-ebs-volume" { + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "group" { + count = var.alloc_count + + volume "test" { + type = "csi" + source = "ebsTestVolume" + attachment_mode = "file-system" + access_mode = "single-node-writer" + } + + task "task" { + driver = "docker" + + config { + image = "busybox:1" + command = "httpd" + args = ["-vv", "-f", "-p", "8001", "-h", "/local"] + } + + volume_mount { + volume = "test" + destination = "${NOMAD_TASK_DIR}/test" + read_only = false + } + + resources { + cpu = 100 + memory = 64 + } + } + + task "sidecar" { + driver = "docker" + + config { + image = "busybox:1" + command = "/bin/sh" + args = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"] + } + + lifecycle { + hook = "poststart" + sidecar = false + } + + volume_mount { + volume = "test" + destination = "${NOMAD_TASK_DIR}/test" + read_only = false + } + + resources { + cpu = 10 + memory = 10 + } + + } + } +} diff --git a/enos/modules/run_workloads/main.tf b/enos/modules/run_workloads/main.tf index e3e3fec73b5..bb94617a592 100644 --- a/enos/modules/run_workloads/main.tf +++ b/enos/modules/run_workloads/main.tf @@ -9,6 +9,9 @@ terraform { } } +resource "random_pet" "volume_tag" { +} + locals { nomad_env = { NOMAD_ADDR = var.nomad_addr @@ -16,12 +19,23 @@ locals { NOMAD_CLIENT_CERT = var.cert_file NOMAD_CLIENT_KEY = var.key_file NOMAD_TOKEN = var.nomad_token + VOLUME_TAG = random_pet.volume_tag.id } system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" }) service_batch_allocs = sum([for wl in var.workloads : wl.alloc_count]) } +# test volume we'll register for the CSI workload +resource "aws_ebs_volume" "test_volume" { + availability_zone = var.availability_zone + size = 10 + + tags = { + VolumeTag = 
random_pet.volume_tag.id + } +} + resource "enos_local_exec" "wait_for_nomad_api" { environment = local.nomad_env @@ -29,28 +43,38 @@ resource "enos_local_exec" "wait_for_nomad_api" { } resource "enos_local_exec" "get_nodes" { + depends_on = [enos_local_exec.wait_for_nomad_api] environment = local.nomad_env inline = ["nomad node status -json | jq '[.[] | select(.Status == \"ready\")] | length'"] } resource "enos_local_exec" "get_jobs" { + depends_on = [enos_local_exec.wait_for_nomad_api] environment = local.nomad_env inline = ["nomad job status| awk '$4 == \"running\" {count++} END {print count+0}'"] } resource "enos_local_exec" "get_allocs" { + depends_on = [enos_local_exec.wait_for_nomad_api] environment = local.nomad_env inline = ["nomad alloc status -json | jq '[.[] | select(.ClientStatus == \"running\")] | length'"] } resource "enos_local_exec" "workloads" { - depends_on = [enos_local_exec.get_jobs, enos_local_exec.get_allocs] - for_each = var.workloads + depends_on = [ + enos_local_exec.get_jobs, + enos_local_exec.get_allocs, + aws_ebs_volume.test_volume + ] + for_each = var.workloads environment = local.nomad_env - inline = ["nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}"] + inline = [ + "nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}", + each.value.post_script != null ? abspath("${path.module}/${each.value.post_script}") : "echo ok" + ] } diff --git a/enos/modules/run_workloads/scripts/volume.hcl.tpl b/enos/modules/run_workloads/scripts/volume.hcl.tpl new file mode 100644 index 00000000000..99ee051fb71 --- /dev/null +++ b/enos/modules/run_workloads/scripts/volume.hcl.tpl @@ -0,0 +1,13 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 + +type = "csi" +id = "ebsTestVolume" +name = "IDEMPOTENCY_TOKEN" +external_id = "EXTERNAL_ID" +plugin_id = "aws-ebs0" + +capability { + access_mode = "single-node-writer" + attachment_mode = "file-system" +} diff --git a/enos/modules/run_workloads/scripts/wait_for_ebs_plugin.sh b/enos/modules/run_workloads/scripts/wait_for_ebs_plugin.sh new file mode 100755 index 00000000000..7222b7ff6d9 --- /dev/null +++ b/enos/modules/run_workloads/scripts/wait_for_ebs_plugin.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +set -euo pipefail +set -x + +# note: it can take a very long time for plugins to come up +TIMEOUT=60 +INTERVAL=2 +last_error= +start_time=$(date +%s) + +checkPlugin() { + local pluginStatus foundControllers foundNodes + pluginStatus=$(nomad plugin status aws-ebs0) || { + last_error="could not read CSI plugin status" + return 1 + } + + foundControllers=$(echo "$pluginStatus" | awk -F'= +' '/Controllers Healthy/{print $2}') + if [[ "$foundControllers" != 2 ]]; then + echo "$foundControllers" + last_error="expected plugin to have 2 healthy controllers, found $foundControllers" + return 1 + fi + + foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}') + if [[ "$foundNodes" == 0 ]]; then + last_error="expected plugin to have at least 1 healthy nodes, found none" + return 1 + fi + return 0 +} + +registerVolume() { + local externalID idempotencyToken dir + idempotencyToken=$(uuidgen) + dir=$(dirname "${BASH_SOURCE[0]}") + externalID=$(aws ec2 describe-volumes --filters "Name=tag:VolumeTag,Values=$VOLUME_TAG" --output=json | jq -r '.Volumes[0].VolumeId') || { + echo "Could not find volume for $VOLUME_TAG" + exit 1 + } + + sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \ + -e "s/EXTERNAL_ID/$externalID/" \ + "${dir}/volume.hcl.tpl" | nomad volume register - || { + echo "Could not register volume" + exit 1 + } +} + +while : +do + checkPlugin && break 
+ + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [ "$elapsed_time" -ge "$TIMEOUT" ]; then + echo "Error: CSI plugin did not become available within $TIMEOUT seconds." + exit 1 + fi + + sleep "$INTERVAL" +done + +registerVolume +nomad volume status -type csi diff --git a/enos/modules/run_workloads/variables.tf b/enos/modules/run_workloads/variables.tf index 6bd96875ec3..5077121fb39 100644 --- a/enos/modules/run_workloads/variables.tf +++ b/enos/modules/run_workloads/variables.tf @@ -35,5 +35,11 @@ variable "workloads" { job_spec = string alloc_count = number type = string + post_script = optional(string) })) } + +variable "availability_zone" { + description = "The AWS availability zone the cluster was deployed to." + default = "us-east-1b" +}