Run 790 add gke installation script #14
@@ -0,0 +1,40 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

image:
  repository: nvcr.io/nvidia/k8s/dcgm-exporter
  pullPolicy: IfNotPresent
  tag: 2.3.5-2.6.5-ubuntu20.04

arguments: ["--kubernetes-gpu-id-type", "device-name"]

securityContext:
  runAsNonRoot: false
  runAsUser: 0
  capabilities:
    add: ["SYS_ADMIN"]
  privileged: true

serviceMonitor:
  enabled: false
  interval: 15s
  additionalLabels: {}

nodeSelector:
  feature.node.kubernetes.io/pci-10de.present: "true"

tolerations:
- effect: NoSchedule
  key: nvidia.com/gpu
  operator: Exists
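
For context, this values file is what the GKE patch script later in this PR feeds to helm when installing dcgm-exporter. Assuming the file is saved locally (the filename here is illustrative; the script writes it to a temp file), the install would look roughly like:

    helm repo add gpu-helm-charts https://nvidia.github.io/gpu-monitoring-tools/helm-charts
    helm repo update
    helm install -f dcgm-exporter-values.yaml dcgm-exporter gpu-helm-charts/dcgm-exporter -n <DCGM_NAMESPACE>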
@@ -0,0 +1,272 @@
import os
import sys
import json
import subprocess


DEBUG = True
GPU_TOLERATION = {'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Exists'}

DCGM_EXPORTER_VALUES_YAML = """
image:
  repository: nvcr.io/nvidia/k8s/dcgm-exporter
  pullPolicy: IfNotPresent
  tag: 2.3.5-2.6.5-ubuntu20.04

arguments: ["--kubernetes-gpu-id-type", "device-name"]

securityContext:
  runAsNonRoot: false
  runAsUser: 0
  capabilities:
    add: ["SYS_ADMIN"]
  privileged: true

serviceMonitor:
  enabled: false
  interval: 15s
  additionalLabels: {}

nodeSelector:
  feature.node.kubernetes.io/pci-10de.present: "true"

tolerations:
- effect: NoSchedule
  key: nvidia.com/gpu
  operator: Exists
"""

class PatchingDs():
    """Base class: fetch a DaemonSet as JSON, let a subclass edit it, then re-apply it."""
    def __init__(self, ds_name):
        self._name = ds_name
        self._should_edit = True

    def _get_json(self):
        debug_print('Getting {} json'.format(self._name))
        json_output = exec_command(self._get_json_command)
        return json.loads(json_output)

    def _pre_patch(self):
        return

    def patch(self):
        if not self._should_edit:
            return

        self._pre_patch()

        ds_json = self._get_json()
        self.edit_ds_json(ds_json)
        debug_print('Applying edited {}'.format(self._name))
        apply_json(ds_json)

    def edit_ds_json(self, ds_json):
        raise NotImplementedError()


class Gfd(PatchingDs):
    def __init__(self, version):
        PatchingDs.__init__(self, 'gpu-feature-discovery')
        ds_name = self._get_gfd_ds_name(version)
        self._get_json_command = 'kubectl get ds {} -n node-feature-discovery -ojson'.format(ds_name)

    def _get_gfd_ds_name(self, version):
        # The DaemonSet name changed between runai versions
        if version == 2.4:
            return 'runai-cluster-gpu-feature-discovery'
        if version >= 2.5:
            return 'gpu-feature-discovery'
        return ''

    def edit_ds_json(self, ds_json):
        add_nvidia_volumes_if_needed(ds_json)
        remove_priority_class(ds_json)
        add_gpu_toleration_if_needed(ds_json)


class Nfd(PatchingDs):
    def __init__(self, version):
        PatchingDs.__init__(self, 'node-feature-discovery')

        if version < 2.5:
            debug_print('No need to edit nfd - version: {}'.format(version))
            self._should_edit = False
            return

        self._get_json_command = 'kubectl get ds nfd-worker -n node-feature-discovery -ojson'

    def edit_ds_json(self, ds_json):
        add_gpu_toleration_if_needed(ds_json)


class DcgmExporter(PatchingDs):
    def __init__(self, dcgm_exporter_namespace):
        PatchingDs.__init__(self, 'dcgm-exporter')
        self._dcgm_exporter_namespace = dcgm_exporter_namespace
        self._get_json_command = 'kubectl get ds dcgm-exporter -n {} -ojson'.format(self._dcgm_exporter_namespace)

    def _pre_patch(self):
        debug_print('Installing dcgm-exporter (if needed)')

        dcgm_exporter_values_filepath = 'dcgm-exporter-values-temp.yaml'
        write_to_file(DCGM_EXPORTER_VALUES_YAML, dcgm_exporter_values_filepath)

        install_dcgm_exporter_commands = [
            'helm repo add gpu-helm-charts https://nvidia.github.io/gpu-monitoring-tools/helm-charts',
            'helm repo update',
            'helm install -f {} dcgm-exporter gpu-helm-charts/dcgm-exporter -n {}'.format(dcgm_exporter_values_filepath, self._dcgm_exporter_namespace)
        ]

        for command in install_dcgm_exporter_commands:
            exec_command(command)

        os.remove(dcgm_exporter_values_filepath)

    def edit_ds_json(self, ds_json):
        add_nvidia_volumes_if_needed(ds_json)
        edit_probes(ds_json)


################ General Functions ################
def debug_print(str_to_print):
    if DEBUG:
        print(str_to_print)


def write_to_file(file_content, file_path):
    with open(file_path, 'w') as f:
        f.write(file_content)


def exec_command(command):
    output = subprocess.run(command.split(), stdout=subprocess.PIPE)
    return str(output.stdout, 'utf-8') if output is not None else ""


def exec_string_command(string_command):
    # shell=True variant for commands that contain quoted JSON arguments
    output = subprocess.run(string_command, stdout=subprocess.PIPE, shell=True)
    return str(output.stdout, 'utf-8') if output is not None else ""


def apply_json(json_content):
    json_filepath = '/tmp/json_to_deploy.json'
    write_to_file(json.dumps(json_content), json_filepath)

    apply_json_command = 'kubectl apply -f {}'.format(json_filepath)
    exec_command(apply_json_command)

    os.remove(json_filepath)

################ DS editing ################
def add_nvidia_volumes(ds_json):
    debug_print('Adding nvidia volume to ds')

    volumes = ds_json['spec']['template']['spec'].get('volumes')
    if not volumes:
        ds_json['spec']['template']['spec']['volumes'] = []

    volumeMounts = ds_json['spec']['template']['spec']['containers'][0].get('volumeMounts')
    if not volumeMounts:
        ds_json['spec']['template']['spec']['containers'][0]['volumeMounts'] = []

    # GKE nodes expose the NVIDIA driver installation at this hostPath
    nvidia_volume = {'hostPath': {'path': '/home/kubernetes/bin/nvidia', 'type': 'Directory'}, 'name': 'nvidia-volume'}
    nvidia_volume_mount = {'mountPath': '/usr/local/nvidia', 'name': 'nvidia-volume'}

    ds_json['spec']['template']['spec']['volumes'].append(nvidia_volume)
    ds_json['spec']['template']['spec']['containers'][0]['volumeMounts'].append(nvidia_volume_mount)


def add_nvidia_volumes_if_needed(ds_json):
    is_nvidia_volume_found = False
    volumes = ds_json['spec']['template']['spec'].get('volumes')
    if volumes:
        for volume in volumes:
            # Use .get() so volumes without a hostPath (e.g. configMaps) don't raise a KeyError
            if volume.get('hostPath', {}).get('path') == '/home/kubernetes/bin/nvidia':
                is_nvidia_volume_found = True
                break

    if is_nvidia_volume_found:
        debug_print('Nvidia volume already found in ds')
        return

    add_nvidia_volumes(ds_json)

def remove_priority_class(ds_json):
    priorityClass = ds_json['spec']['template']['spec'].get('priorityClassName')
    if not priorityClass:
        debug_print('priorityClassName not found in ds - nothing to remove')
        return

    debug_print('Removing priorityClassName from ds')
    # Setting the field to None (null) makes kubectl apply drop it from the object
    ds_json['spec']['template']['spec']['priorityClassName'] = None


def add_gpu_toleration(ds_json):
    debug_print('Adding gpu toleration to ds')

    tolerations = ds_json['spec']['template']['spec'].get('tolerations')
    if not tolerations:
        ds_json['spec']['template']['spec']['tolerations'] = []

    ds_json['spec']['template']['spec']['tolerations'].append(GPU_TOLERATION)


def add_gpu_toleration_if_needed(ds_json):
    is_gpu_toleration_found = False
    tolerations = ds_json['spec']['template']['spec'].get('tolerations')
    if tolerations:
        for toleration in tolerations:
            if GPU_TOLERATION == toleration:
                is_gpu_toleration_found = True
                break

    if is_gpu_toleration_found:
        debug_print('GPU toleration already found in ds')
        return

    add_gpu_toleration(ds_json)


def edit_probe(ds_json, probe_name):
    debug_print('Editing {} for ds'.format(probe_name))
    probe = ds_json['spec']['template']['spec']['containers'][0].get(probe_name)
    if not probe:
        ds_json['spec']['template']['spec']['containers'][0][probe_name] = {}

    ds_json['spec']['template']['spec']['containers'][0][probe_name]['failureThreshold'] = 20
    ds_json['spec']['template']['spec']['containers'][0][probe_name]['initialDelaySeconds'] = 120
    ds_json['spec']['template']['spec']['containers'][0][probe_name]['periodSeconds'] = 30


def edit_probes(ds_json):
    edit_probe(ds_json, 'livenessProbe')
    edit_probe(ds_json, 'readinessProbe')


################ runaiconfig ################
def patch_runaiconfig(dcgm_exporter_namespace):
    debug_print('Patching runaiconfig with dcgm-exporter namespace')
    patch_command = 'kubectl patch RunaiConfig runai -n runai -p \'{"spec": {"global": {"nvidiaDcgmExporter": {"namespace": "%s", "installedFromGpuOperator": false}}}}\' --type="merge"' % (dcgm_exporter_namespace, )
    exec_string_command(patch_command)


################ main ################
def parse_args():
    if len(sys.argv) < 3:
        exit('Please provide the runai-version and dcgm-exporter namespace as arguments for the script, for example:\n' +
             '"python3 gke_patches.py 2.4 <DCGM_NAMESPACE>"')

    version_arg = sys.argv[1]
    try:
        version = float(version_arg)
    except ValueError:
        version = 0

    if version < 2.4:
        exit('Valid versions are: 2.4, 2.5..., for example:\n"python3 gke_patches.py 2.4 <DCGM_NAMESPACE>"')

    dcgm_exporter_namespace = sys.argv[2]
    return version, dcgm_exporter_namespace


def patch_for_gke(version, dcgm_exporter_namespace):
    ds_to_patch = [Gfd(version), Nfd(version), DcgmExporter(dcgm_exporter_namespace)]
    for ds in ds_to_patch:
        ds.patch()

    patch_runaiconfig(dcgm_exporter_namespace)


def main():
    version, dcgm_exporter_namespace = parse_args()
    patch_for_gke(version, dcgm_exporter_namespace)


if __name__ == "__main__":
    main()
I saw what you did in here. The only problem I have is with the hard-coded adding of the text: you are adding [...] right after [...].
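
For reference, the script takes the runai version and the dcgm-exporter namespace as arguments, per its usage message; a run would look roughly like this (the version and namespace below are illustrative):

    python3 gke_patches.py 2.5 monitoring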
@@ -0,0 +1,51 @@
import sys
import subprocess


def exec_command(command):
    output = subprocess.run(command.split(), stdout=subprocess.PIPE)
    return str(output.stdout, 'utf-8') if output is not None else ""


def create_monitoring_ns():
    print("Creating namespace monitoring")
    create_monitoring_ns_command = 'kubectl create namespace monitoring'
    exec_command(create_monitoring_ns_command)


def install_gfd_if_needed(version):
    if version < 2.5:
        print("No need to install gfd for version: {}".format(version))
        return

    install_gfd_commands = [
        'helm repo add nvgfd https://nvidia.github.io/gpu-feature-discovery',
        'helm repo update',
        'helm install --version=0.5.0 gpu-feature-discovery nvgfd/gpu-feature-discovery'
    ]

    for command in install_gfd_commands:
        exec_command(command)

def parse_args():
    if len(sys.argv) < 2:
        exit('Please provide the runai-version as an argument for the script, for example:\n' +
             '"python3 pre_runai_script.py 2.4"')

    version_arg = sys.argv[1]
    try:
        version = float(version_arg)
    except ValueError:
        version = 0

    if version < 2.4:
        exit('Valid versions are: 2.4, 2.5..., for example:\n"python3 pre_runai_script.py 2.4"')

    return version


def main():
    version = parse_args()

    create_monitoring_ns()
    install_gfd_if_needed(version)


if __name__ == "__main__":
    main()
You don't need all of this file if line 20 is the only change from the defaults.
I wonder if you need a file at all if you change only one field. It can be changed directly from the CLI.
I do need it, because this is the values file we are using for the helm install, and we can't omit any of those fields. If you want, we can use the default installation and then change all those values ourselves with the Python script.