-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Run 790 add gke installation script #14
base: main
Are you sure you want to change the base?
Changes from 2 commits
3b15096
96b160c
96a7d94
f7fe09e
3e1f662
a535dc3
1a4e799
bbb532a
865863b
43a30d3
0e04dac
d7aa51f
b462042
35a045e
a2e6fa9
73ed8da
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# NOTE(review): indentation reconstructed from a flattened diff page — verify
# against the original values.yaml before use.

# Container image for the dcgm-exporter DaemonSet.
image:
  repository: nvcr.io/nvidia/k8s/dcgm-exporter
  pullPolicy: IfNotPresent
  tag: 2.3.4-2.6.4-ubuntu20.04

# Report GPU metrics keyed by device name rather than GPU UUID.
arguments: ["--kubernetes-gpu-id-type", "device-name"]

imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""

serviceAccount:
  # Create a dedicated ServiceAccount; an empty name defaults to the chart fullname.
  create: true
  annotations: {}
  name:

podAnnotations: {}
podSecurityContext: {}

# dcgm-exporter needs root and SYS_ADMIN to talk to the NVIDIA driver.
securityContext:
  runAsNonRoot: false
  runAsUser: 0
  capabilities:
    add: ["SYS_ADMIN"]
  privileged: true

# Metrics service exposed inside the cluster on port 9400.
service:
  enable: true
  type: ClusterIP
  port: 9400
  address: ":9400"
  annotations: {}

resources: {}
# Prometheus Operator ServiceMonitor (disabled by default).
serviceMonitor:
  enabled: false
  interval: 15s
  additionalLabels: {}

mapPodsMetrics: false

# Schedule only on nodes exposing an NVIDIA PCI device (vendor id 10de),
# as labeled by node-feature-discovery.
nodeSelector:
  feature.node.kubernetes.io/pci-10de.present: "true"

# Tolerate the standard NVIDIA GPU taint so pods can land on GPU nodes.
tolerations:
  - effect: NoSchedule
    key: nvidia.com/gpu
    operator: Exists

affinity: {}

extraHostVolumes: []
extraConfigMapVolumes: []
extraVolumeMounts: []
extraEnv: []

# Socket directory of the kubelet pod-resources API (used for pod<->GPU mapping).
kubeletPath: "/var/lib/kubelet/pod-resources"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
import os | ||
import sys | ||
import subprocess | ||
|
||
|
||
# Default namespace of the dcgm-exporter DaemonSet; can be overridden by the
# first CLI argument (see get_dcgm_exporter_namespace_from_args()).
DCGM_EXPORTER_NAMESPACE = 'runai'
# When True, debug_print() emits progress messages to stdout.
DEBUG = True
|
||
# YAML fragment appended verbatim after a 'volumes:' line of the DaemonSet
# yaml: host-path volume for the GKE directory holding the NVIDIA driver.
# NOTE(review): the source was scraped from a diff and lost the literal's
# indentation — it must match the indent level of the kubectl-emitted yaml
# it is spliced into; verify against the original file.
NVIDIA_VOLUME = '''
      - hostPath:
          path: /home/kubernetes/bin/nvidia
          type: Directory
        name: nvidia
'''
# YAML fragment appended verbatim after a 'volumeMounts:' line: mounts the
# host NVIDIA directory into the container at /usr/local/nvidia.
# NOTE(review): same indentation caveat as NVIDIA_VOLUME above.
NVIDIA_VOLUME_MOUNT = '''
        - mountPath: /usr/local/nvidia
          name: nvidia
'''
|
||
|
||
################ General Functions ################ | ||
def debug_print(str_to_print):
    """Print *str_to_print* to stdout, but only when the global DEBUG flag is set."""
    if not DEBUG:
        return
    print(str_to_print)
|
||
def write_to_file(file_content, file_path):
    """Create or overwrite *file_path* with *file_content* (text mode)."""
    with open(file_path, 'w') as out_file:
        out_file.write(file_content)
|
||
def exec_command(command):
    """Run *command* (whitespace-split into argv, no shell) and return its stdout as text.

    Note: the naive split() means arguments containing spaces are not
    supported — use exec_string_command() for those.
    """
    result = subprocess.run(command.split(), stdout=subprocess.PIPE)
    # subprocess.run() always returns a CompletedProcess (it raises on failure
    # to spawn), so the original "if output is not None" guard was dead code.
    return result.stdout.decode('utf-8')
|
||
def exec_string_command(string_command):
    """Run *string_command* through the shell and return its stdout as text.

    Used for commands that need shell quoting (see patch_runaiconfig); only
    safe for trusted input since shell=True performs no escaping.
    """
    result = subprocess.run(string_command, stdout=subprocess.PIPE, shell=True)
    # subprocess.run() never returns None — the previous None check was dead code.
    return result.stdout.decode('utf-8')
|
||
def apply_yaml(yaml_content):
    """Write *yaml_content* to a temp file, `kubectl apply` it, and delete the file.

    The fixed path means concurrent runs would clobber each other; acceptable
    for a one-shot installation script.
    """
    yaml_filepath = '/tmp/yaml_to_deploy.yaml'
    write_to_file(yaml_content, yaml_filepath)
    try:
        exec_command('kubectl apply -f {}'.format(yaml_filepath))
    finally:
        # Always clean up the temp file, even if the apply step raises.
        os.remove(yaml_filepath)
|
||
def add_nvidia_volumes(gfd_yaml_line):
    """Return *gfd_yaml_line*, with the nvidia volume-mount (resp. volume)
    yaml fragment appended immediately after a 'volumeMounts:' (resp.
    'volumes:') line; other lines pass through unchanged."""
    if 'volumeMounts:' in gfd_yaml_line:
        debug_print('Adding nvidia volume mount')
        return gfd_yaml_line + NVIDIA_VOLUME_MOUNT
    if 'volumes:' in gfd_yaml_line:
        debug_print('Adding nvidia volume')
        return gfd_yaml_line + NVIDIA_VOLUME
    return gfd_yaml_line
|
||
################ gpu-feature-discovery ################ | ||
def remove_priority_class(gfd_yaml_line):
    """Null out a priorityClassName entry; return any other line unchanged.

    GKE rejects the priority class the chart sets, so it is forced to null.
    NOTE(review): the replacement's leading indentation must match the
    DaemonSet yaml's indent level — verify against real kubectl output.
    """
    if 'priorityClassName' not in gfd_yaml_line:
        return gfd_yaml_line
    debug_print('Removing priorityClassName from gpu-feature-discovery')
    return " priorityClassName: null"
|
||
def get_gfd_yaml():
    """Fetch the runai-cluster-gpu-feature-discovery DaemonSet as yaml text."""
    debug_print('Getting gpu-feature-discovery yaml')
    return exec_command(
        'kubectl get ds runai-cluster-gpu-feature-discovery -n node-feature-discovery -oyaml')
|
||
def edit_gfd_yaml(gfd_yaml):
    """Return *gfd_yaml* with priorityClassName nulled and nvidia volumes injected.

    Each line is passed through remove_priority_class() then
    add_nvidia_volumes(); every line (including the last) keeps a trailing
    newline, matching the original behavior.
    """
    edited_lines = []
    for line in gfd_yaml.splitlines():
        line = remove_priority_class(line)
        line = add_nvidia_volumes(line)
        edited_lines.append(line)
    # Single join instead of '+=' per line — avoids quadratic string building.
    return ''.join(line + '\n' for line in edited_lines)
|
||
def edit_gfd():
    """Patch the gpu-feature-discovery DaemonSet for GKE and re-apply it."""
    patched_yaml = edit_gfd_yaml(get_gfd_yaml())
    debug_print('Applying edited gpu-feature-discovery')
    apply_yaml(patched_yaml)
|
||
################ dcgm-exporter ################ | ||
def get_dcgm_exporter_namespace_from_args():
    """Namespace from argv[1] when given, else the DCGM_EXPORTER_NAMESPACE default."""
    if len(sys.argv) > 1:
        return sys.argv[1]
    return DCGM_EXPORTER_NAMESPACE
|
||
def get_dcgm_exporter_yaml(dcgm_exporter_namespace):
    """Fetch the dcgm-exporter DaemonSet yaml from *dcgm_exporter_namespace*."""
    debug_print('Getting dcgm-exporter yaml')
    command = 'kubectl get ds dcgm-exporter -n {} -oyaml'.format(dcgm_exporter_namespace)
    return exec_command(command)
|
||
def edit_dcgm_exporter_yaml(dcgm_exporter_yaml):
    """Return *dcgm_exporter_yaml* with the nvidia volume/volumeMount fragments injected.

    Every line keeps a trailing newline (including the last), matching the
    original behavior.
    """
    # Single join instead of '+=' per line — avoids quadratic string building.
    return ''.join(
        add_nvidia_volumes(line) + '\n'
        for line in dcgm_exporter_yaml.splitlines())
|
||
def edit_dcgm_exporter(dcgm_exporter_namespace):
    """Patch the dcgm-exporter DaemonSet with nvidia volumes and re-apply it."""
    patched_yaml = edit_dcgm_exporter_yaml(
        get_dcgm_exporter_yaml(dcgm_exporter_namespace))
    debug_print('Applying edited dcgm-exporter')
    apply_yaml(patched_yaml)
|
||
################ dcgm-exporter ################ | ||
def patch_runaiconfig(dcgm_exporter_namespace):
    # Record the dcgm-exporter namespace in the RunaiConfig custom resource
    # (spec.global.gpuOperator.namespace) via a JSON merge patch.
    # NOTE(review): the namespace is %-interpolated into a shell string run
    # with shell=True; fine for trusted input, but would break on names
    # containing quotes or spaces — confirm inputs are always k8s namespaces.
    debug_print('Patching runaiconfig with dcgm-exporter namespace')
    patch_command = 'kubectl patch RunaiConfig runai -n runai -p \'{"spec": {"global": {"gpuOperator": {"namespace": "%s"}}}}\' --type="merge"' % (dcgm_exporter_namespace, )
    exec_string_command(patch_command)
|
||
def main():
    """Adapt gpu-feature-discovery and dcgm-exporter for GKE, then patch RunaiConfig."""
    namespace = get_dcgm_exporter_namespace_from_args()
    edit_gfd()
    edit_dcgm_exporter(namespace)
    patch_runaiconfig(namespace)


if __name__ == "__main__":
    main()
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I saw what you did in here. The only problem I have is with the hard adding of the text. I mean you are adding
right after |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You don't need this whole file if line 20 is the only change from the defaults.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if you need a file at all if you change only one field. It can be changed directly from the CLI.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do need it because this is the values file we are using for helm install, and we can't omit any of those fields.
If you want, we can use the default installation and then change all those values ourselves with the python script.