From 90ec5ea8039ff301b4a021a5ac2074686a5019f8 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 23 May 2025 21:33:27 -0400 Subject: [PATCH 1/2] Add TOTAL_MIN_COUNT of a cluster as comment * Run static fleet checks if there are any static nodes --- .../slurm/templates/slurm_parallelcluster.conf | 4 ++++ cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf index 07c383ff79..935c9e7890 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf @@ -36,6 +36,7 @@ include {{ output_dir }}/pcluster/slurm_parallelcluster_{{ queue.Name }}_partiti {% endfor %} {% if ns.has_static %} +{%- set ns.total_min_count = 0 %} SuspendExcNodes= {%- set ns.is_first = True %} {%- for queue in queues %} @@ -43,9 +44,12 @@ SuspendExcNodes= {% if compute_resource.MinCount > 0 %} {{- "," if not ns.is_first else "" -}} {{ queue.Name }}-st-{{ compute_resource.Name }}-[1-{{ compute_resource.MinCount }}] + {%- set ns.total_min_count = ns.total_min_count + compute_resource.MinCount %} {%- set ns.is_first = False %} {%- endif %} {% endfor %} {% endfor %} {% endif %} + +#TOTAL_MIN_COUNT={{ ns.total_min_count }} \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb index dc599d91a6..752e04f5f0 100644 --- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb @@ -178,6 +178,11 @@ def wait_cluster_ready end end +def get_static_node_count + cmd = 
Mixlib::ShellOut.new("cat #{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster.conf | grep -o '#TOTAL_MIN_COUNT=[0-9]*' | cut -d'=' -f2") + cmd.run_command.stdout.strip +end + def wait_static_fleet_running ruby_block "wait for static fleet capacity" do block do @@ -203,6 +208,7 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested fleet_status_command = Shellwords.escape( "/usr/local/bin/get-compute-fleet-status.sh" ) + # Example output for sinfo # sinfo -h -o '%N %t' # queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~ @@ -215,6 +221,7 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested end Chef::Log.info("Static fleet capacity is ready") end + only_if { get_static_node_count.to_i > 0 } end end From 9674576483c77250aeb4acf8e602a84612b3f94d Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 23 May 2025 22:07:17 -0400 Subject: [PATCH 2/2] Adding it as condition --- .../compute_fleet_status/is_fleet_ready.erb | 19 ++++++++++++++----- .../libraries/helpers.rb | 5 ++--- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb b/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb index 0cf4d4a05e..305931d092 100644 --- a/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb +++ b/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb @@ -1,7 +1,16 @@ #!/bin/bash -sinfo_output=$(<%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | grep -v -E '(idle|alloc|mix|maint)$') -while IFS= read -r line; do - nodelist=$(echo "$line" | awk '{print $1}') - <%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; } -done <<< 
"$sinfo_output" + + +cluster_static_node_count=$1 +if [[ -z "$cluster_static_node_count" ]]; then + cluster_static_node_count=1 +fi + +if [[ "$cluster_static_node_count" -ge "1" ]]; then + sinfo_output=$(<%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | grep -v -E '(idle|alloc|mix|maint)$') + while IFS= read -r line; do + nodelist=$(echo "$line" | awk '{print $1}') + <%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; } + done <<< "$sinfo_output" +fi \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb index 752e04f5f0..96e4543b98 100644 --- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb @@ -213,15 +213,14 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested # sinfo -h -o '%N %t' # queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~ # queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle - until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty? - check_for_protected_mode(fleet_status_command) + until shell_out!("/bin/bash -c '/usr/local/bin/is_fleet_ready.sh #{get_static_node_count.to_i}'").stdout.strip.empty? + check_for_protected_mode(fleet_status_command) # TODO: Separate check for dynamic Nodes during dfsmv2 Chef::Log.info("Waiting for static fleet capacity provisioning") sleep(15) end Chef::Log.info("Static fleet capacity is ready") end - only_if { get_static_node_count.to_i > 0 } end end