File tree Expand file tree Collapse file tree 3 files changed +10
-4
lines changed
aws-parallelcluster-platform
resources/fabric_manager/partial
aws-parallelcluster-slurm Expand file tree Collapse file tree 3 files changed +10
-4
lines changed Original file line number Diff line number Diff line change @@ -78,12 +78,14 @@ suites:
78
78
- ' resource:package { "package_name": "dkms" }'
79
79
- resource:build_tools
80
80
- recipe:aws-parallelcluster-platform::nvidia_install
81
+ # - resource:fabric_manager:configure # Needed for multi-GPU instances such as p5.48xlarge
81
82
resource : gdrcopy:configure
82
83
cluster :
83
84
nvidia :
84
85
enabled : true
85
86
driver :
86
87
instance_type : g4dn.2xlarge
88
+ # instance_type: p5.48xlarge
87
89
- name : intel_hpc
88
90
run_list :
89
91
- recipe[aws-parallelcluster-tests::setup]
Original file line number Diff line number Diff line change @@ -63,8 +63,10 @@ def _nvidia_driver_version
63
63
64
64
# Get number of nv switches
65
65
def get_nvswitches
66
- # NVSwitch device id is 10de:1af1
67
- nvswitch_check = Mixlib ::ShellOut . new ( "lspci -d 10de:1af1 | wc -l" )
68
- nvswitch_check . run_command
69
- nvswitch_check . stdout . strip . to_i
66
+ # A100 (P4) and H100 (P5) systems have NVSwitches
67
+ # NVSwitch device ID is 10de:1af1 on P4 instances
68
+ # NVSwitch device ID is 10de:22a3 on P5 instances
69
+ nvswitch_check_p4 = shell_out ( "lspci -d 10de:1af1 | wc -l" )
70
+ nvswitch_check_p5 = shell_out ( "lspci -d 10de:22a3 | wc -l" )
71
+ nvswitch_check_p4 . stdout . strip . to_i + nvswitch_check_p5 . stdout . strip . to_i
70
72
end
Original file line number Diff line number Diff line change @@ -84,10 +84,12 @@ suites:
84
84
- /gpu_health_check_execution/
85
85
driver :
86
86
instance_type : g4dn.xlarge
87
+ # instance_type: p5.48xlarge
87
88
attributes :
88
89
dependencies :
89
90
- recipe:aws-parallelcluster-slurm::mock_slurm
90
91
- resource:node_attributes
92
+ # - resource:fabric_manager:configure # Needed for multi-GPU instances such as p5.48xlarge
91
93
cluster :
92
94
node_type : HeadNode
93
95
scheduler : ' slurm'
You can’t perform that action at this time.
0 commit comments