Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run 790 add gke installation script #14

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions gke/dcgm-exporter-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

image:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need all of this file if line 20 is the only change from the defaults

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if you need a file at all if you change only one field. It can be changed directly from the CLI

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do need it because this is the values file we are using for helm install, and we can't omit any of those fields.
If you want, we can use the default installation and then change all those values ourselves with the python script.

repository: nvcr.io/nvidia/k8s/dcgm-exporter
pullPolicy: IfNotPresent
tag: 2.3.4-2.6.4-ubuntu20.04

arguments: ["--kubernetes-gpu-id-type", "device-name"]

imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""

serviceAccount:
create: true
annotations: {}
name:

podAnnotations: {}
podSecurityContext: {}

securityContext:
runAsNonRoot: false
runAsUser: 0
capabilities:
add: ["SYS_ADMIN"]
privileged: true

service:
enable: true
type: ClusterIP
port: 9400
address: ":9400"
annotations: {}

resources: {}
serviceMonitor:
enabled: false
interval: 15s
additionalLabels: {}

mapPodsMetrics: false

nodeSelector:
feature.node.kubernetes.io/pci-10de.present: "true"

tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists

affinity: {}

extraHostVolumes: []
extraConfigMapVolumes: []
extraVolumeMounts: []
extraEnv: []

kubeletPath: "/var/lib/kubelet/pod-resources"
120 changes: 120 additions & 0 deletions gke/gke_patches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import os
import sys
import subprocess


# Default namespace of the dcgm-exporter DaemonSet; can be overridden via argv[1].
DCGM_EXPORTER_NAMESPACE = 'runai'
# When True, debug_print() emits progress messages to stdout.
DEBUG = True

# YAML snippets appended verbatim after the 'volumes:' / 'volumeMounts:' lines
# of the DaemonSet manifests by add_nvidia_volumes().
# NOTE(review): the snippet's leading indentation must match the manifest's
# indentation at the insertion point — indentation appears lost in this view,
# verify against the live DaemonSet manifest before applying.
NVIDIA_VOLUME = '''
- hostPath:
path: /home/kubernetes/bin/nvidia
type: Directory
name: nvidia
'''
NVIDIA_VOLUME_MOUNT = '''
- mountPath: /usr/local/nvidia
name: nvidia
'''


################ General Functions ################
def debug_print(str_to_print):
    """Print *str_to_print* only when the module-level DEBUG flag is on."""
    if not DEBUG:
        return
    print(str_to_print)

def write_to_file(file_content, file_path):
    """Overwrite *file_path* with *file_content* (text mode)."""
    with open(file_path, 'w') as handle:
        handle.write(file_content)

def exec_command(command):
    """Run *command* (split on whitespace) and return its stdout as text.

    Note: whitespace splitting breaks arguments that contain spaces; use
    exec_string_command for shell-quoted command lines.
    """
    # subprocess.run always returns a CompletedProcess (or raises, e.g.
    # FileNotFoundError for a missing binary), so the original
    # "if output is not None" guard was dead code and has been removed.
    result = subprocess.run(command.split(), stdout=subprocess.PIPE)
    return result.stdout.decode('utf-8')

def exec_string_command(string_command):
    """Run *string_command* through the shell and return its stdout as text.

    Kept shell=True because callers pass fully quoted command lines
    (e.g. kubectl patch with an embedded JSON literal). Do not pass
    untrusted input here.
    """
    # subprocess.run always returns a CompletedProcess (or raises), so the
    # original "if output is not None" guard was dead code and was removed.
    result = subprocess.run(string_command, stdout=subprocess.PIPE, shell=True)
    return result.stdout.decode('utf-8')

def apply_yaml(yaml_content):
    """Write *yaml_content* to a temp file, `kubectl apply` it, then delete the file.

    The temp path is fixed, so concurrent invocations would clobber each other;
    this matches the original single-run usage.
    """
    yaml_filepath = '/tmp/yaml_to_deploy.yaml'
    write_to_file(yaml_content, yaml_filepath)
    try:
        apply_yaml_command = 'kubectl apply -f {}'.format(yaml_filepath)
        exec_command(apply_yaml_command)
    finally:
        # Clean up even if kubectl is missing or exec_command raises;
        # the original leaked the temp file on any exception.
        os.remove(yaml_filepath)

def add_nvidia_volumes(gfd_yaml_line):
    """Append the nvidia volume / volumeMount snippet after the matching key line.

    Lines that mention neither 'volumeMounts:' nor 'volumes:' pass through
    unchanged.
    """
    if 'volumeMounts:' in gfd_yaml_line:
        debug_print('Adding nvidia volume mount')
        return gfd_yaml_line + NVIDIA_VOLUME_MOUNT
    if 'volumes:' in gfd_yaml_line:
        debug_print('Adding nvidia volume')
        return gfd_yaml_line + NVIDIA_VOLUME
    return gfd_yaml_line

################ gpu-feature-discovery ################
def remove_priority_class(gfd_yaml_line):
    """Replace a priorityClassName line with an explicit null assignment.

    All other lines are returned untouched.
    """
    if 'priorityClassName' not in gfd_yaml_line:
        return gfd_yaml_line
    debug_print('Removing priorityClassName from gpu-feature-discovery')
    return " priorityClassName: null"

def get_gfd_yaml():
    """Fetch the gpu-feature-discovery DaemonSet manifest as YAML text."""
    debug_print('Getting gpu-feature-discovery yaml')
    command = 'kubectl get ds runai-cluster-gpu-feature-discovery -n node-feature-discovery -oyaml'
    return exec_command(command)

def edit_gfd_yaml(gfd_yaml):
    """Return *gfd_yaml* with per-line patches applied.

    Each line is first stripped of priorityClassName, then has the nvidia
    volume snippets appended where applicable.
    """
    patched = [
        add_nvidia_volumes(remove_priority_class(line)) + '\n'
        for line in gfd_yaml.splitlines()
    ]
    return ''.join(patched)

def edit_gfd():
    """Patch the gpu-feature-discovery DaemonSet and re-apply it to the cluster."""
    patched_yaml = edit_gfd_yaml(get_gfd_yaml())
    debug_print('Applying edited gpu-feature-discovery')
    apply_yaml(patched_yaml)

################ dcgm-exporter ################
def get_dcgm_exporter_namespace_from_args():
    """Return the namespace from argv[1], or DCGM_EXPORTER_NAMESPACE when absent."""
    cli_args = sys.argv[1:]
    if cli_args:
        return cli_args[0]
    return DCGM_EXPORTER_NAMESPACE

def get_dcgm_exporter_yaml(dcgm_exporter_namespace):
    """Fetch the dcgm-exporter DaemonSet manifest from *dcgm_exporter_namespace*."""
    debug_print('Getting dcgm-exporter yaml')
    command = 'kubectl get ds dcgm-exporter -n {} -oyaml'.format(dcgm_exporter_namespace)
    return exec_command(command)

def edit_dcgm_exporter_yaml(dcgm_exporter_yaml):
    """Return *dcgm_exporter_yaml* with the nvidia volume snippets inserted."""
    return ''.join(
        add_nvidia_volumes(line) + '\n'
        for line in dcgm_exporter_yaml.splitlines()
    )

def edit_dcgm_exporter(dcgm_exporter_namespace):
    """Patch the dcgm-exporter DaemonSet in *dcgm_exporter_namespace* and re-apply it."""
    patched_yaml = edit_dcgm_exporter_yaml(get_dcgm_exporter_yaml(dcgm_exporter_namespace))
    debug_print('Applying edited dcgm-exporter')
    apply_yaml(patched_yaml)

################ runaiconfig ################
def patch_runaiconfig(dcgm_exporter_namespace):
    """Point runaiconfig's gpuOperator namespace at where dcgm-exporter lives."""
    debug_print('Patching runaiconfig with dcgm-exporter namespace')
    # %-formatting is deliberate: the JSON literal is full of braces that
    # str.format would misinterpret.
    patch_template = 'kubectl patch RunaiConfig runai -n runai -p \'{"spec": {"global": {"gpuOperator": {"namespace": "%s"}}}}\' --type="merge"'
    exec_string_command(patch_template % (dcgm_exporter_namespace, ))

def main():
    """Entry point: patch gpu-feature-discovery, dcgm-exporter, and runaiconfig."""
    namespace = get_dcgm_exporter_namespace_from_args()

    edit_gfd()
    edit_dcgm_exporter(namespace)
    patch_runaiconfig(namespace)

# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I saw what you did in here. The only problem I have is with the hard adding of the text. I mean you are adding

          path: /home/kubernetes/bin/nvidia
          type: Directory
        name: nvidia

right after volumeMounts:
But who guarantees they are going to be in there? And that the indentation will stay the same in all versions?
If you already write it in python why not work directly with kubernetes client?
Or alternatively parse the string into yaml, add the field and then serialize it to yaml?