File tree 7 files changed +53
-14
lines changed
aws-parallelcluster-config
aws-parallelcluster-slurm
templates/default/slurm/compute
aws_parallelcluster_config
aws_parallelcluster_slurm 7 files changed +53
-14
lines changed Original file line number Diff line number Diff line change 35
35
group 'root'
36
36
mode '0644'
37
37
end
38
- # Install nvidia_persistenced. See https://download.nvidia.com/XFree86/Linux-x86_64/396.51/README/nvidia-persistenced.html
39
- bash 'Install nvidia_persistenced' do
40
- cwd '/usr/share/doc/NVIDIA_GLX-1.0/samples'
41
- user 'root'
38
+
39
+ # Install ParallelCluster nvidia service.
40
+ # The service ensures the creation of the block devices /dev/nvidia0 after reboot and it is needed by the slurmd service
41
+ # cookbooks/aws-parallelcluster-slurm/templates/default/slurm/compute/slurmd_nvidia_persistenced.conf.erb
42
+ #
43
+ # The service starts nvidia-persistenced, or runs nvidia-smi when nvidia-persistenced is already running, to avoid a race condition with other services
44
+ template '/etc/systemd/system/parallelcluster_nvidia.service' do
45
+ source 'nvidia/parallelcluster_nvidia_service.erb'
46
+ owner 'root'
42
47
group 'root'
43
- code <<-NVIDIA
44
- tar -xf nvidia-persistenced-init.tar.bz2
45
- ./nvidia-persistenced-init/install.sh
46
- NVIDIA
48
+ mode '0644'
49
+ action :create
50
+ variables ( is_nvidia_persistenced_running : is_process_running ( 'nvidia-persistenced' ) )
51
+ end
52
+
53
+ service "parallelcluster_nvidia" do
54
+ supports restart : false
55
+ action %i( enable start )
47
56
end
48
57
end
Original file line number Diff line number Diff line change
1
+ # This systemd service file is designed to trigger the creation of the block device file /dev/nvidia0.
2
+ # The service starts nvidia-persistenced if it is not already running; otherwise it executes the nvidia-smi command.
3
+
4
+ [Unit]
5
+ Description=ParallelCluster NVIDIA Daemon
6
+ Wants=syslog.target
7
+
8
+ [Service]
9
+ <% if @is_nvidia_persistenced_running -%>
10
+ Type=simple
11
+ ExecStart=/usr/bin/nvidia-smi
12
+ RemainAfterExit=yes
13
+ <% else %>
14
+ Type=forking
15
+ ExecStart=/usr/bin/nvidia-persistenced --user root
16
+ ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
17
+ <% end %>
18
+
19
+ [Install]
20
+ WantedBy=multi-user.target
Original file line number Diff line number Diff line change 24
24
action :create
25
25
end
26
26
27
- # Add systemd dependency between slurmd and nvidia-persistenced for NVIDIA GPU nodes
27
+ # Add systemd dependency between slurmd and parallelcluster_nvidia for NVIDIA GPU nodes
28
28
if graphic_instance? && nvidia_installed?
29
29
directory '/etc/systemd/system/slurmd.service.d' do
30
30
user 'root'
Original file line number Diff line number Diff line change 1
1
[Unit]
2
- After=nvidia-persistenced .service
3
- Wants=nvidia-persistenced .service
2
+ After=parallelcluster_nvidia .service
3
+ Wants=parallelcluster_nvidia .service
Original file line number Diff line number Diff line change @@ -129,6 +129,16 @@ def ignore_failure(lookup)
129
129
end
130
130
end
131
131
132
+ #
133
+ # Check if a process is running
134
+ #
135
# Reports whether any running process has +process_name+ in its command line.
#
# The process table is read by invoking `ps` through an argument vector
# (no shell), so +process_name+ is never interpolated into a shell command
# and no helper `grep` process exists that would have to be filtered out.
#
# @param process_name [String] text to look for; matched literally, not as a regex
# @return [Boolean] true when at least one command line contains +process_name+
def is_process_running(process_name)
  wanted = process_name.to_s

  # -e: every process; -ww: do not truncate long command lines;
  # -o args=: print only the command line, with no header row.
  ps = Mixlib::ShellOut.new('ps', '-e', '-ww', '-o', 'args=')
  ps.run_command

  ps.stdout.each_line.any? { |command_line| command_line.include?(wanted) }
end
141
+
132
142
#
133
143
# Check if the instance has a GPU
134
144
#
Original file line number Diff line number Diff line change 55
55
its ( 'content' ) { should include ( "uvm" ) }
56
56
end
57
57
58
- describe service ( 'nvidia-persistenced ' ) do
58
+ describe service ( 'parallelcluster_nvidia ' ) do
59
59
it { should be_enabled }
60
60
it { should be_running }
61
61
end
Original file line number Diff line number Diff line change 28
28
29
29
describe 'Check slurmd systemd "after" dependencies'
30
30
describe command ( 'systemctl list-dependencies --after --plain slurmd.service' ) do
31
- its ( 'stdout' ) { should include "nvidia-persistenced .service" }
31
+ its ( 'stdout' ) { should include "parallelcluster_nvidia .service" }
32
32
end
33
33
describe 'Check slurmd systemd requirement dependencies'
34
34
describe command ( 'systemctl list-dependencies --plain slurmd.service' ) do
35
- its ( 'stdout' ) { should include "nvidia-persistenced .service" }
35
+ its ( 'stdout' ) { should include "parallelcluster_nvidia .service" }
36
36
end
37
37
end
You can’t perform that action at this time.
0 commit comments