Skip to content

Commit 5b45d2f

Browse files
francesco-giordano authored and hanwen-pcluste committed
Replace nvidia-persistenced service with parallelcluster_nvidia service to avoid conflicts with DLAMI
The parallelcluster_nvidia service ensures the creation of the device file /dev/nvidia0, which is needed by the slurmd service. It starts nvidia-persistenced, or runs nvidia-smi if nvidia-persistenced is already running, to avoid race conditions with other services. Signed-off-by: Francesco Giordano <[email protected]>
1 parent a5d6e8d commit 5b45d2f

File tree

7 files changed

+53
-14
lines changed

7 files changed

+53
-14
lines changed

cookbooks/aws-parallelcluster-config/recipes/nvidia.rb

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,23 @@
3535
group 'root'
3636
mode '0644'
3737
end
38-
# Install nvidia_persistenced. See https://download.nvidia.com/XFree86/Linux-x86_64/396.51/README/nvidia-persistenced.html
39-
bash 'Install nvidia_persistenced' do
40-
cwd '/usr/share/doc/NVIDIA_GLX-1.0/samples'
41-
user 'root'
38+
39+
# Install ParallelCluster nvidia service.
40+
# The service ensures that the device file /dev/nvidia0 is created after a reboot; the slurmd service depends on it, see:
41+
# cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd_nvidia_persistenced.conf.erb
42+
#
43+
# The service starts nvidia-persistenced, or runs nvidia-smi if nvidia-persistenced is already running, to avoid race conditions with other services
44+
template '/etc/systemd/system/parallelcluster_nvidia.service' do
45+
source 'nvidia/parallelcluster_nvidia_service.erb'
46+
owner 'root'
4247
group 'root'
43-
code <<-NVIDIA
44-
tar -xf nvidia-persistenced-init.tar.bz2
45-
./nvidia-persistenced-init/install.sh
46-
NVIDIA
48+
mode '0644'
49+
action :create
50+
variables(is_nvidia_persistenced_running: is_process_running('nvidia-persistenced'))
51+
end
52+
53+
service "parallelcluster_nvidia" do
54+
supports restart: false
55+
action %i(enable start)
4756
end
4857
end
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# This systemd service file is designed to trigger the creation of the device file /dev/nvidia0.
2+
# The service starts nvidia-persistenced if it is not already running; otherwise it executes the nvidia-smi command.
3+
4+
[Unit]
5+
Description=ParallelCluster NVIDIA Daemon
6+
Wants=syslog.target
7+
8+
[Service]
9+
<% if @is_nvidia_persistenced_running -%>
10+
Type=simple
11+
ExecStart=/usr/bin/nvidia-smi
12+
RemainAfterExit=yes
13+
<% else %>
14+
Type=forking
15+
ExecStart=/usr/bin/nvidia-persistenced --user root
16+
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
17+
<% end %>
18+
19+
[Install]
20+
WantedBy=multi-user.target

cookbooks/aws-parallelcluster-slurm/recipes/config_slurmd_systemd_service.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
action :create
2525
end
2626

27-
# Add systemd dependency between slurmd and nvidia-persistenced for NVIDIA GPU nodes
27+
# Add systemd dependency between slurmd and parallelcluster_nvidia for NVIDIA GPU nodes
2828
if graphic_instance? && nvidia_installed?
2929
directory '/etc/systemd/system/slurmd.service.d' do
3030
user 'root'
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
[Unit]
2-
After=nvidia-persistenced.service
3-
Wants=nvidia-persistenced.service
2+
After=parallelcluster_nvidia.service
3+
Wants=parallelcluster_nvidia.service

libraries/helpers.rb

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,16 @@ def ignore_failure(lookup)
129129
end
130130
end
131131

132+
#
133+
# Check if a process is running
134+
#
135+
def is_process_running(process_name)
136+
ps = Mixlib::ShellOut.new("ps aux | grep '#{process_name}' | egrep -v \"grep .*#{process_name}\"")
137+
ps.run_command
138+
139+
!ps.stdout.strip.empty?
140+
end
141+
132142
#
133143
# Check if the instance has a GPU
134144
#

test/recipes/controls/aws_parallelcluster_config/nvidia_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
its('content') { should include("uvm") }
5656
end
5757

58-
describe service('nvidia-persistenced') do
58+
describe service('parallelcluster_nvidia') do
5959
it { should be_enabled }
6060
it { should be_running }
6161
end

test/recipes/controls/aws_parallelcluster_slurm/config_slurmd_systemd_service_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@
2828

2929
describe 'Check slurmd systemd "after" dependencies'
3030
describe command('systemctl list-dependencies --after --plain slurmd.service') do
31-
its('stdout') { should include "nvidia-persistenced.service" }
31+
its('stdout') { should include "parallelcluster_nvidia.service" }
3232
end
3333
describe 'Check slurmd systemd requirement dependencies'
3434
describe command('systemctl list-dependencies --plain slurmd.service') do
35-
its('stdout') { should include "nvidia-persistenced.service" }
35+
its('stdout') { should include "parallelcluster_nvidia.service" }
3636
end
3737
end

0 commit comments

Comments
 (0)