upgrade tests: add CSI workload
Add an upgrade test workload for CSI with the AWS EBS plugin. In order to
validate this workload, we'll need to deploy the plugin jobs and then register
volumes with them. So this extends the `run_workloads` module to allow for a
"post script" to be run after a given job has been deployed. We can use that as
a model for other test workloads.

Ref: https://hashicorp.atlassian.net/browse/NET-12217
tgross committed Feb 26, 2025
1 parent b131320 commit a9ac909
Showing 8 changed files with 292 additions and 6 deletions.
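
For context on exercising this change: the new workload runs as part of the upgrade scenario via the Enos CLI. A minimal sketch, assuming scenario variables are already configured in enos.vars.hcl and the default variant matrix applies:

    cd enos
    enos scenario run upgrade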
27 changes: 22 additions & 5 deletions enos/enos-scenario-upgrade.hcl
@@ -89,18 +89,35 @@ scenario "upgrade" {

    module = module.run_workloads
    variables {
      nomad_addr   = step.provision_cluster.nomad_addr
      ca_file      = step.provision_cluster.ca_file
      cert_file    = step.provision_cluster.cert_file
      key_file     = step.provision_cluster.key_file
      nomad_token  = step.provision_cluster.nomad_token
      cluster_name = local.cluster_name

      workloads = {
        # deploy these first
        csi_plugin_ebs_controller = {
          job_spec    = "jobs/plugin-aws-ebs-controller.nomad.hcl"
          alloc_count = 2
          type        = "service"
        }
        csi_plugin_ebs_node = {
          job_spec    = "jobs/plugin-aws-ebs-nodes.nomad.hcl"
          alloc_count = 0
          type        = "system"
          post_script = "scripts/wait_for_ebs_plugin.sh"
        }

        service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3, type = "service" }
        service_docker   = { job_spec = "jobs/docker-service.nomad.hcl", alloc_count = 3, type = "service" }
        system_docker    = { job_spec = "jobs/docker-system.nomad.hcl", alloc_count = 0, type = "system" }
        batch_docker     = { job_spec = "jobs/docker-batch.nomad.hcl", alloc_count = 3, type = "batch" }
        batch_raw_exec   = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" }
        system_raw_exec  = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" }
        wants_csi        = { job_spec = "jobs/wants-volume.nomad.hcl", alloc_count = 2, type = "service" }
      }
    }

45 changes: 45 additions & 0 deletions enos/modules/run_workloads/jobs/plugin-aws-ebs-controller.nomad.hcl
@@ -0,0 +1,45 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
variable "alloc_count" {
type = number
default = 2
}

job "plugin-aws-ebs-controller" {

constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}

group "controller" {

count = var.alloc_count

task "plugin" {
driver = "docker"

config {
image = "public.ecr.aws/ebs-csi-driver/aws-ebs-csi-driver:v1.33.0"

args = [
"controller",
"--endpoint=${CSI_ENDPOINT}",
"--logtostderr",
"--v=5",
]
}

csi_plugin {
id = "aws-ebs0"
type = "controller"
mount_dir = "/csi"
}

resources {
cpu = 100
memory = 256
}
}
}
}
42 changes: 42 additions & 0 deletions enos/modules/run_workloads/jobs/plugin-aws-ebs-nodes.nomad.hcl
@@ -0,0 +1,42 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

job "plugin-aws-ebs-nodes" {

constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}

type = "system"

group "nodes" {
task "plugin" {
driver = "docker"

config {
image = "public.ecr.aws/ebs-csi-driver/aws-ebs-csi-driver:v1.33.0"

args = [
"node",
"--endpoint=${CSI_ENDPOINT}",
"--logtostderr",
"--v=5",
]

privileged = true
}

csi_plugin {
id = "aws-ebs0"
type = "node"
mount_dir = "/csi"
}

resources {
cpu = 100
memory = 256
}
}
}
}
75 changes: 75 additions & 0 deletions enos/modules/run_workloads/jobs/wants-volume.nomad.hcl
@@ -0,0 +1,75 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

variable "alloc_count" {
type = number
default = 1
}

# a job that mounts an EBS volume and writes its job ID as a file
job "wants-ebs-volume" {

constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}

group "group" {
count = var.alloc_count

volume "test" {
type = "csi"
source = "ebsTestVolume"
attachment_mode = "file-system"
access_mode = "single-node-writer"
}

task "task" {
driver = "docker"

config {
image = "busybox:1"
command = "httpd"
args = ["-vv", "-f", "-p", "8001", "-h", "/local"]
}

volume_mount {
volume = "test"
destination = "${NOMAD_TASK_DIR}/test"
read_only = false
}

resources {
cpu = 100
memory = 64
}
}

task "sidecar" {
driver = "docker"

config {
image = "busybox:1"
command = "/bin/sh"
args = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"]
}

lifecycle {
hook = "poststart"
sidecar = false
}

volume_mount {
volume = "test"
destination = "${NOMAD_TASK_DIR}/test"
read_only = false
}

resources {
cpu = 10
memory = 10
}

}
}
}
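
To spot-check the mount from a running allocation, something like the following should work, where <alloc-id> comes from `nomad job status wants-ebs-volume` and "task" is the main task's name above:

    # the volume_mount destination above resolves to /local/test inside the container
    nomad alloc exec -task task <alloc-id> ls -la /local/test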
24 changes: 23 additions & 1 deletion enos/modules/run_workloads/main.tf
@@ -16,12 +16,31 @@ locals {
    NOMAD_CLIENT_CERT = var.cert_file
    NOMAD_CLIENT_KEY  = var.key_file
    NOMAD_TOKEN       = var.nomad_token
    CLUSTER_NAME      = var.cluster_name
  }

  system_job_count     = length({ for k, v in var.workloads : k => v if v.type == "system" })
  service_batch_allocs = sum([for wl in var.workloads : wl.alloc_count])
}

# We need the availability zone of one of the instances so that the test
# volume gets created in the same AZ as the cluster.
data "aws_instance" "server0" {
  filter {
    name   = "tag:Name"
    values = ["${var.cluster_name}-server0"]
  }
}

# test volume we'll register for the CSI workload
resource "aws_ebs_volume" "test_volume" {
  availability_zone = data.aws_instance.server0.availability_zone
  size              = 10

  tags = {
    Cluster = var.cluster_name
  }
}
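
The registration script finds this volume by its Cluster tag rather than by ID, so the tag is the contract between this Terraform resource and the script. A quick manual lookup, with <cluster-name> as a placeholder:

    aws ec2 describe-volumes --filters "Name=tag:Cluster,Values=<cluster-name>" --output json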

resource "enos_local_exec" "wait_for_nomad_api" {
environment = local.nomad_env

Expand Down Expand Up @@ -52,5 +71,8 @@ resource "enos_local_exec" "workloads" {

  environment = local.nomad_env

  inline = [
    "nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}",
    # post_script is optional(string) and defaults to null, so test against
    # null rather than ""; the path is module-relative, like job_spec
    each.value.post_script != null ? "${path.module}/${each.value.post_script}" : "echo ok"
  ]
}
13 changes: 13 additions & 0 deletions enos/modules/run_workloads/scripts/volume.hcl.tpl
@@ -0,0 +1,13 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

type = "csi"
id = "ebsTestVolume"
name = "IDEMPOTENCY_TOKEN"
external_id = "EXTERNAL_ID"
plugin_id = "aws-ebs0"

capability {
access_mode = "single-node-writer"
attachment_mode = "file-system"
}
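
The wait script below fills the two uppercase placeholders with sed and pipes the result to `nomad volume register`. The rendered spec would look roughly like this, with an illustrative uuidgen token and volume ID:

    type        = "csi"
    id          = "ebsTestVolume"
    name        = "1b4e28ba-2fa1-11d2-883f-b9a761bde3fb"
    external_id = "vol-0123456789abcdef0"
    plugin_id   = "aws-ebs0"

    capability {
      access_mode     = "single-node-writer"
      attachment_mode = "file-system"
    }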
66 changes: 66 additions & 0 deletions enos/modules/run_workloads/scripts/wait_for_ebs_plugin.sh
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -euo pipefail

# note: it can take a very long time for plugins to come up
TIMEOUT=60
INTERVAL=2
last_error=
start_time=$(date +%s)

checkPlugin() {
    local pluginStatus foundControllers foundNodes
    pluginStatus=$(nomad plugin status aws-ebs0) || {
        last_error="could not read CSI plugin status"
        return 1
    }

    foundControllers=$(echo "$pluginStatus" | awk -F'= +' '/Controllers Healthy/{print $2}')
    if [[ "$foundControllers" != 2 ]]; then
        last_error="expected plugin to have 2 healthy controllers, found $foundControllers"
        return 1
    fi

    foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}')
    if [[ "$foundNodes" == 0 ]]; then
        last_error="expected plugin to have at least 1 healthy node, found none"
        return 1
    fi
    return 0
}

registerVolume() {
    local externalID idempotencyToken
    idempotencyToken=$(uuidgen)

    # -r strips jq's quotes so the bare volume ID lands in the rendered HCL
    externalID=$(aws ec2 describe-volumes \
        --filters "Name=tag:Cluster,Values=$CLUSTER_NAME" \
        --output=json | jq -r '.Volumes[0].VolumeId') || {
        echo "Could not find volume for $CLUSTER_NAME"
        exit 1
    }

    # resolve the template relative to this script, not the caller's cwd
    sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \
        -e "s/EXTERNAL_ID/$externalID/" \
        "$(dirname "$0")/volume.hcl.tpl" | nomad volume register -
}

while :
do
    checkPlugin && break

    current_time=$(date +%s)
    elapsed_time=$((current_time - start_time))
    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
        echo "Error: CSI plugin did not become available within $TIMEOUT seconds: $last_error"
        exit 1
    fi

    sleep "$INTERVAL"
done

registerVolume
nomad volume status -type csi
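
The awk expressions in checkPlugin depend on the aligned `key = value` layout of `nomad plugin status`; for a healthy plugin the relevant lines look roughly like this (counts are illustrative):

    ID                   = aws-ebs0
    Provider             = ebs.csi.aws.com
    Controllers Healthy  = 2
    Controllers Expected = 2
    Nodes Healthy        = 4
    Nodes Expected       = 4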
6 changes: 6 additions & 0 deletions enos/modules/run_workloads/variables.tf
@@ -35,5 +35,11 @@ variable "workloads" {
    job_spec    = string
    alloc_count = number
    type        = string
    post_script = optional(string)
  }))
}

variable "cluster_name" {
description = "The name of the cluster, which we need to find the AZ"
type = string
}
