diff --git a/cmd/experimental/kjobctl/pkg/builder/builder.go b/cmd/experimental/kjobctl/pkg/builder/builder.go index 32c78bfb81..e8f3d27e1f 100644 --- a/cmd/experimental/kjobctl/pkg/builder/builder.go +++ b/cmd/experimental/kjobctl/pkg/builder/builder.go @@ -537,6 +537,12 @@ func (b *Builder) buildObjectMeta(templateObjectMeta metav1.ObjectMeta) metav1.O objectMeta.Labels[constants.ModeLabel] = string(b.mode.Name) } + return objectMeta +} + +func (b *Builder) buildObjectMetaWithKueueLabels(templateObjectMeta metav1.ObjectMeta) metav1.ObjectMeta { + objectMeta := b.buildObjectMeta(templateObjectMeta) + if len(b.localQueue) > 0 { objectMeta.Labels[kueueconstants.QueueLabel] = b.localQueue } diff --git a/cmd/experimental/kjobctl/pkg/builder/interactive_builder.go b/cmd/experimental/kjobctl/pkg/builder/interactive_builder.go index 081224866a..465bdf5429 100644 --- a/cmd/experimental/kjobctl/pkg/builder/interactive_builder.go +++ b/cmd/experimental/kjobctl/pkg/builder/interactive_builder.go @@ -41,7 +41,7 @@ func (b *interactiveBuilder) build(ctx context.Context) (runtime.Object, []runti Kind: "Pod", APIVersion: "v1", }, - ObjectMeta: b.buildObjectMeta(template.Template.ObjectMeta), + ObjectMeta: b.buildObjectMetaWithKueueLabels(template.Template.ObjectMeta), Spec: template.Template.Spec, } diff --git a/cmd/experimental/kjobctl/pkg/builder/job_builder.go b/cmd/experimental/kjobctl/pkg/builder/job_builder.go index 5443f05122..6a9a55b027 100644 --- a/cmd/experimental/kjobctl/pkg/builder/job_builder.go +++ b/cmd/experimental/kjobctl/pkg/builder/job_builder.go @@ -42,7 +42,7 @@ func (b *jobBuilder) build(ctx context.Context) (runtime.Object, []runtime.Objec Kind: "Job", APIVersion: "batch/v1", }, - ObjectMeta: b.buildObjectMeta(template.Template.ObjectMeta), + ObjectMeta: b.buildObjectMetaWithKueueLabels(template.Template.ObjectMeta), Spec: template.Template.Spec, } diff --git a/cmd/experimental/kjobctl/pkg/builder/ray_cluster_builder.go 
b/cmd/experimental/kjobctl/pkg/builder/ray_cluster_builder.go index 7a9b719213..ad48dec431 100644 --- a/cmd/experimental/kjobctl/pkg/builder/ray_cluster_builder.go +++ b/cmd/experimental/kjobctl/pkg/builder/ray_cluster_builder.go @@ -42,7 +42,7 @@ func (b *rayClusterBuilder) build(ctx context.Context) (runtime.Object, []runtim Kind: "RayCluster", APIVersion: "ray.io/v1", }, - ObjectMeta: b.buildObjectMeta(template.Template.ObjectMeta), + ObjectMeta: b.buildObjectMetaWithKueueLabels(template.Template.ObjectMeta), Spec: template.Template.Spec, } diff --git a/cmd/experimental/kjobctl/pkg/builder/ray_job_builder.go b/cmd/experimental/kjobctl/pkg/builder/ray_job_builder.go index 9cfcb6da05..0788e01917 100644 --- a/cmd/experimental/kjobctl/pkg/builder/ray_job_builder.go +++ b/cmd/experimental/kjobctl/pkg/builder/ray_job_builder.go @@ -45,7 +45,7 @@ func (b *rayJobBuilder) build(ctx context.Context) (runtime.Object, []runtime.Ob Kind: "RayJob", APIVersion: "ray.io/v1", }, - ObjectMeta: b.buildObjectMeta(template.Template.ObjectMeta), + ObjectMeta: b.buildObjectMetaWithKueueLabels(template.Template.ObjectMeta), Spec: template.Template.Spec, } diff --git a/cmd/experimental/kjobctl/pkg/builder/slurm_builder.go b/cmd/experimental/kjobctl/pkg/builder/slurm_builder.go index 5bda7511e3..baa6095d67 100644 --- a/cmd/experimental/kjobctl/pkg/builder/slurm_builder.go +++ b/cmd/experimental/kjobctl/pkg/builder/slurm_builder.go @@ -81,6 +81,8 @@ var ( type slurmBuilder struct { *Builder + jobTemplate *v1alpha1.JobTemplate + objectName string scriptContent string template *template.Template @@ -109,7 +111,14 @@ func (b *slurmBuilder) validateGeneral() error { return nil } -func (b *slurmBuilder) complete() error { +func (b *slurmBuilder) complete(ctx context.Context) error { + jobTemplate, err := b.kjobctlClientset.KjobctlV1alpha1().JobTemplates(b.profile.Namespace). 
+ Get(ctx, string(b.mode.Template), metav1.GetOptions{}) + if err != nil { + return err + } + b.jobTemplate = jobTemplate + content, err := os.ReadFile(b.script) if err != nil { return err @@ -186,24 +195,18 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj return nil, nil, err } - if err := b.complete(); err != nil { + if err := b.complete(ctx); err != nil { return nil, nil, err } - template, err := b.kjobctlClientset.KjobctlV1alpha1().JobTemplates(b.profile.Namespace). - Get(ctx, string(b.mode.Template), metav1.GetOptions{}) - if err != nil { - return nil, nil, err - } - - objectMeta := b.buildObjectMeta(template.Template.ObjectMeta) + objectMeta := b.buildObjectMetaWithKueueLabels(b.jobTemplate.Template.ObjectMeta) objectMeta.GenerateName = "" objectMeta.Name = b.objectName job := &batchv1.Job{ TypeMeta: metav1.TypeMeta{Kind: "Job", APIVersion: "batch/v1"}, ObjectMeta: objectMeta, - Spec: template.Template.Spec, + Spec: b.jobTemplate.Template.Spec, } job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion) job.Spec.Template.Spec.Subdomain = b.objectName @@ -215,7 +218,7 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ LocalObjectReference: corev1.LocalObjectReference{ - Name: b.objectName, + Name: fmt.Sprintf("%s-scripts", b.objectName), }, Items: []corev1.KeyToPath{ { @@ -255,6 +258,13 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj }, }, }, + EnvFrom: []corev1.EnvFromSource{{ + ConfigMapRef: &corev1.ConfigMapEnvSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: fmt.Sprintf("%s-env", b.objectName), + }, + }, + }}, VolumeMounts: []corev1.VolumeMount{ { Name: "slurm-scripts", @@ -386,6 +396,13 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj Name: "JOB_CONTAINER_INDEX", Value: strconv.FormatInt(int64(i), 10), }) + 
job.Spec.Template.Spec.Containers[i].EnvFrom = append(job.Spec.Template.Spec.Containers[i].EnvFrom, corev1.EnvFromSource{ + ConfigMapRef: &corev1.ConfigMapEnvSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: fmt.Sprintf("%s-env", b.objectName), + }, + }, + }) } if b.nodes != nil { @@ -411,17 +428,30 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj b.cpusPerGpu = resource.NewQuantity(cpusPerGpu, b.cpusOnNode.Format) } - initEntrypointScript, err := b.buildInitEntrypointScript() + scriptsConfigMap, err := b.buildScriptsConfigMap() if err != nil { return nil, nil, err } + return job, []runtime.Object{scriptsConfigMap, b.buildEnvConfigMap(), b.buildService()}, nil +} + +func (b *slurmBuilder) buildScriptsConfigMap() (*corev1.ConfigMap, error) { + objectMeta := b.buildObjectMeta(b.jobTemplate.Template.ObjectMeta) + objectMeta.GenerateName = "" + objectMeta.Name = fmt.Sprintf("%s-scripts", b.objectName) + + initEntrypointScript, err := b.buildInitEntrypointScript() + if err != nil { + return nil, err + } + entrypointScript, err := b.buildEntrypointScript() if err != nil { - return nil, nil, err + return nil, err } - configMap := &corev1.ConfigMap{ + return &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{Kind: "ConfigMap", APIVersion: "v1"}, ObjectMeta: objectMeta, Data: map[string]string{ @@ -429,9 +459,56 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj slurmEntrypointFilename: entrypointScript, slurmScriptFilename: b.scriptContent, }, + }, nil +} + +func (b *slurmBuilder) buildEnvConfigMap() *corev1.ConfigMap { + objectMeta := b.buildObjectMeta(b.jobTemplate.Template.ObjectMeta) + objectMeta.GenerateName = "" + objectMeta.Name = fmt.Sprintf("%s-env", b.objectName) + + nTasks := ptr.Deref(b.nTasks, 1) + nodes := ptr.Deref(b.nodes, 1) + + nodeList := make([]string, nodes) + for i := int32(0); i < nodes; i++ { + nodeList[i] = fmt.Sprintf("%s-%d.%s", b.objectName, i, b.objectName) } - 
service := &corev1.Service{ + return &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{Kind: "ConfigMap", APIVersion: "v1"}, + ObjectMeta: objectMeta, + Data: map[string]string{ + "SLURM_ARRAY_JOB_ID": fmt.Sprint(slurmArrayJobID), + "SLURM_ARRAY_TASK_COUNT": fmt.Sprint(b.arrayIndexes.Count()), + "SLURM_ARRAY_TASK_MAX": fmt.Sprint(b.arrayIndexes.Max()), + "SLURM_ARRAY_TASK_MIN": fmt.Sprint(b.arrayIndexes.Min()), + "SLURM_TASKS_PER_NODE": fmt.Sprint(nTasks), + "SLURM_CPUS_PER_TASK": getValueOrEmpty(b.cpusPerTask), + "SLURM_CPUS_ON_NODE": getValueOrEmpty(b.cpusOnNode), + "SLURM_JOB_CPUS_PER_NODE": getValueOrEmpty(b.cpusOnNode), + "SLURM_CPUS_PER_GPU": getValueOrEmpty(b.cpusPerGpu), + "SLURM_MEM_PER_CPU": getValueOrEmpty(b.memPerCPU), + "SLURM_MEM_PER_GPU": getValueOrEmpty(b.memPerGPU), + "SLURM_MEM_PER_NODE": getValueOrEmpty(b.totalMemPerNode), + "SLURM_GPUS": getValueOrEmpty(b.totalGpus), + "SLURM_NTASKS": fmt.Sprint(nTasks), + "SLURM_NTASKS_PER_NODE": fmt.Sprint(nTasks), + "SLURM_NPROCS": fmt.Sprint(nTasks), + "SLURM_NNODES": fmt.Sprint(nodes), + "SLURM_SUBMIT_DIR": slurmScriptsPath, + "SLURM_JOB_NODELIST": strings.Join(nodeList, ","), + "SLURM_JOB_FIRST_NODE": nodeList[0], + }, + } +} + +func (b *slurmBuilder) buildService() *corev1.Service { + objectMeta := b.buildObjectMeta(b.jobTemplate.Template.ObjectMeta) + objectMeta.GenerateName = "" + objectMeta.Name = b.objectName + + return &corev1.Service{ TypeMeta: metav1.TypeMeta{Kind: "Service", APIVersion: "v1"}, ObjectMeta: objectMeta, Spec: corev1.ServiceSpec{ @@ -441,8 +518,6 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj }, }, } - - return job, []runtime.Object{configMap, service}, nil } func (b *slurmBuilder) buildArrayIndexes() string { @@ -480,40 +555,11 @@ type slurmInitEntrypointScript struct { EnvsPath string SlurmEnvFilename string - SlurmArrayJobID int32 - SlurmArrayTaskCount int32 - SlurmArrayTaskMax int32 - SlurmArrayTaskMin int32 - SlurmTasksPerNode int32 - 
SlurmCPUsPerTask string - SlurmCPUsOnNode string - SlurmJobCPUsPerNode string - SlurmCPUsPerGPU string - SlurmMemPerCPU string - SlurmMemPerGPU string - SlurmMemPerNode string - SlurmGPUs string - SlurmNTasks int32 - SlurmNTasksPerNode int32 - SlurmNProcs int32 - SlurmNNodes int32 - SlurmSubmitDir string - SlurmJobNodeList string - SlurmJobFirstNode string - FirstNodeIP bool FirstNodeIPTimeoutSeconds int32 } func (b *slurmBuilder) buildInitEntrypointScript() (string, error) { - nTasks := ptr.Deref(b.nTasks, 1) - nodes := ptr.Deref(b.nodes, 1) - - nodeList := make([]string, nodes) - for i := int32(0); i < nodes; i++ { - nodeList[i] = fmt.Sprintf("%s-%d.%s", b.objectName, i, b.objectName) - } - scriptValues := slurmInitEntrypointScript{ ArrayIndexes: b.buildArrayIndexes(), @@ -523,27 +569,6 @@ func (b *slurmBuilder) buildInitEntrypointScript() (string, error) { EnvsPath: slurmEnvsPath, SlurmEnvFilename: slurmSlurmEnvFilename, - SlurmArrayJobID: slurmArrayJobID, - SlurmArrayTaskCount: int32(b.arrayIndexes.Count()), - SlurmArrayTaskMax: b.arrayIndexes.Max(), - SlurmArrayTaskMin: b.arrayIndexes.Min(), - SlurmTasksPerNode: nTasks, - SlurmCPUsPerTask: getValueOrEmpty(b.cpusPerTask), - SlurmCPUsOnNode: getValueOrEmpty(b.cpusOnNode), - SlurmJobCPUsPerNode: getValueOrEmpty(b.cpusOnNode), - SlurmCPUsPerGPU: getValueOrEmpty(b.cpusPerGpu), - SlurmMemPerCPU: getValueOrEmpty(b.memPerCPU), - SlurmMemPerGPU: getValueOrEmpty(b.memPerGPU), - SlurmMemPerNode: getValueOrEmpty(b.totalMemPerNode), - SlurmGPUs: getValueOrEmpty(b.totalGpus), - SlurmNTasks: nTasks, - SlurmNTasksPerNode: nTasks, - SlurmNProcs: nTasks, - SlurmNNodes: nodes, - SlurmSubmitDir: slurmScriptsPath, - SlurmJobNodeList: strings.Join(nodeList, ","), - SlurmJobFirstNode: nodeList[0], - FirstNodeIP: b.firstNodeIP, FirstNodeIPTimeoutSeconds: int32(b.firstNodeIPTimeout.Seconds()), } diff --git a/cmd/experimental/kjobctl/pkg/builder/slurm_builder_test.go b/cmd/experimental/kjobctl/pkg/builder/slurm_builder_test.go index 
b1b7b01d60..86ef1924b8 100644 --- a/cmd/experimental/kjobctl/pkg/builder/slurm_builder_test.go +++ b/cmd/experimental/kjobctl/pkg/builder/slurm_builder_test.go @@ -137,13 +137,13 @@ func TestSlurmBuilderDo(t *testing.T) { WithSupportedMode(*wrappers.MakeSupportedMode(v1alpha1.SlurmMode, "slurm-job-template").Obj()). Obj(), }, - wantRootObj: wrappers.MakeJob("", metav1.NamespaceDefault). + wantRootObj: wrappers.MakeJob("profile-slurm-c7rb4", metav1.NamespaceDefault). Parallelism(2). Completions(5). CompletionMode(batchv1.IndexedCompletion). Profile("profile"). Mode(v1alpha1.SlurmMode). - Subdomain("profile-slurm"). + Subdomain("profile-slurm-c7rb4"). WithInitContainer(*wrappers.MakeContainer("slurm-init-env", "bash:latest"). Command("sh", "/slurm/scripts/init-entrypoint.sh"). WithVolumeMount(corev1.VolumeMount{Name: "slurm-scripts", MountPath: "/slurm/scripts"}). @@ -166,11 +166,18 @@ func TestSlurmBuilderDo(t *testing.T) { WithEnvVar(corev1.EnvVar{Name: "TIMESTAMP", Value: testStartTime.Format(time.RFC3339)}). WithEnvVar(corev1.EnvVar{Name: "JOB_CONTAINER_INDEX", Value: "0"}). Obj()). + WithEnvFrom(corev1.EnvFromSource{ + ConfigMapRef: &corev1.ConfigMapEnvSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: "profile-slurm-c7rb4-env", + }, + }, + }). WithVolume(corev1.Volume{ Name: "slurm-scripts", VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{Name: "profile-slurm"}, + LocalObjectReference: corev1.LocalObjectReference{Name: "profile-slurm-c7rb4-scripts"}, Items: []corev1.KeyToPath{ {Key: "init-entrypoint.sh", Path: "init-entrypoint.sh"}, {Key: "entrypoint.sh", Path: "entrypoint.sh"}, @@ -187,7 +194,7 @@ func TestSlurmBuilderDo(t *testing.T) { }). Obj(), wantChildObjs: []runtime.Object{ - wrappers.MakeConfigMap("", metav1.NamespaceDefault). + wrappers.MakeConfigMap("profile-slurm-c7rb4-scripts", metav1.NamespaceDefault). Profile("profile"). Mode(v1alpha1.SlurmMode). 
Data(map[string]string{ @@ -206,7 +213,7 @@ set -x array_indexes="1;2;3;4;5" container_indexes=$(echo "$array_indexes" | awk -F';' -v idx="$JOB_COMPLETION_INDEX" '{print $((idx + 1))}') -for i in $(seq 0 1) +for i in $(seq 0 $SLURM_TASKS_PER_NODE) do container_index=$(echo "$container_indexes" | awk -F',' -v idx="$i" '{print $((idx + 1))}') @@ -218,29 +225,9 @@ do cat << EOF > /slurm/env/$i/slurm.env -SLURM_ARRAY_JOB_ID=1 -SLURM_ARRAY_TASK_COUNT=5 -SLURM_ARRAY_TASK_MAX=5 -SLURM_ARRAY_TASK_MIN=1 -SLURM_TASKS_PER_NODE=1 -SLURM_CPUS_PER_TASK= -SLURM_CPUS_ON_NODE= -SLURM_JOB_CPUS_PER_NODE= -SLURM_CPUS_PER_GPU= -SLURM_MEM_PER_CPU= -SLURM_MEM_PER_GPU= -SLURM_MEM_PER_NODE= -SLURM_GPUS= -SLURM_NTASKS=1 -SLURM_NTASKS_PER_NODE=1 -SLURM_NPROCS=1 -SLURM_NNODES=2 -SLURM_SUBMIT_DIR=/slurm/scripts SLURM_SUBMIT_HOST=$HOSTNAME -SLURM_JOB_NODELIST=profile-slurm-0.profile-slurm,profile-slurm-1.profile-slurm -SLURM_JOB_FIRST_NODE=profile-slurm-0.profile-slurm -SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* 1 + $i + 1) -SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* 1 + $i + 1) +SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* $SLURM_TASKS_PER_NODE + $i + $SLURM_ARRAY_JOB_ID) +SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* $SLURM_TASKS_PER_NODE + $i + $SLURM_ARRAY_JOB_ID) SLURM_ARRAY_TASK_ID=$container_index SLURM_JOB_FIRST_NODE_IP=${SLURM_JOB_FIRST_NODE_IP:-""} EOF @@ -268,11 +255,37 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs) `, }). Obj(), - wrappers.MakeService("profile-slurm", metav1.NamespaceDefault). + wrappers.MakeConfigMap("profile-slurm-c7rb4-env", metav1.NamespaceDefault). + Profile("profile"). + Mode(v1alpha1.SlurmMode). 
+ Data(map[string]string{ + "SLURM_ARRAY_JOB_ID": "1", + "SLURM_ARRAY_TASK_COUNT": "5", + "SLURM_ARRAY_TASK_MAX": "5", + "SLURM_ARRAY_TASK_MIN": "1", + "SLURM_TASKS_PER_NODE": "1", + "SLURM_CPUS_PER_TASK": "", + "SLURM_CPUS_ON_NODE": "", + "SLURM_JOB_CPUS_PER_NODE": "", + "SLURM_CPUS_PER_GPU": "", + "SLURM_MEM_PER_CPU": "", + "SLURM_MEM_PER_GPU": "", + "SLURM_MEM_PER_NODE": "", + "SLURM_GPUS": "", + "SLURM_NTASKS": "1", + "SLURM_NTASKS_PER_NODE": "1", + "SLURM_NPROCS": "1", + "SLURM_NNODES": "2", + "SLURM_SUBMIT_DIR": "/slurm/scripts", + "SLURM_JOB_NODELIST": "profile-slurm-mwnv4-0.profile-slurm-mwnv4,profile-slurm-mwnv4-1.profile-slurm-mwnv4", + "SLURM_JOB_FIRST_NODE": "profile-slurm-mwnv4-0.profile-slurm-mwnv4", + }). + Obj(), + wrappers.MakeService("profile-slurm-c7rb4", metav1.NamespaceDefault). Profile("profile"). Mode(v1alpha1.SlurmMode). ClusterIP("None"). - Selector("job-name", "profile-slurm"). + Selector("job-name", "profile-slurm-c7rb4"). Obj(), }, cmpopts: []cmp.Option{ @@ -343,14 +356,11 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs) return } - defaultCmpOpts := []cmp.Option{cmpopts.IgnoreFields(metav1.ObjectMeta{}, "Name")} - opts = append(defaultCmpOpts, tc.cmpopts...) 
- - if diff := cmp.Diff(tc.wantRootObj, gotRootObj, opts...); diff != "" { + if diff := cmp.Diff(tc.wantRootObj, gotRootObj, tc.cmpopts...); diff != "" { t.Errorf("Root object after build (-want,+got):\n%s", diff) } - if diff := cmp.Diff(tc.wantChildObjs, gotChildObjs, opts...); diff != "" { + if diff := cmp.Diff(tc.wantChildObjs, gotChildObjs, tc.cmpopts...); diff != "" { t.Errorf("Child objects after build (-want,+got):\n%s", diff) } }) diff --git a/cmd/experimental/kjobctl/pkg/builder/templates/slurm_init_entrypoint_script.sh.tmpl b/cmd/experimental/kjobctl/pkg/builder/templates/slurm_init_entrypoint_script.sh.tmpl index 12c2be493a..f6763247ad 100644 --- a/cmd/experimental/kjobctl/pkg/builder/templates/slurm_init_entrypoint_script.sh.tmpl +++ b/cmd/experimental/kjobctl/pkg/builder/templates/slurm_init_entrypoint_script.sh.tmpl @@ -12,7 +12,7 @@ set -x array_indexes="{{.ArrayIndexes}}" container_indexes=$(echo "$array_indexes" | awk -F';' -v idx="$JOB_COMPLETION_INDEX" '{print $((idx + 1))}') -for i in $(seq 0 {{.SlurmNTasksPerNode}}) +for i in $(seq 0 $SLURM_TASKS_PER_NODE) do container_index=$(echo "$container_indexes" | awk -F',' -v idx="$i" '{print $((idx + 1))}') @@ -29,7 +29,7 @@ do timeout={{.FirstNodeIPTimeoutSeconds}} start_time=$(date +%s) while true; do - ip=$(nslookup "{{.SlurmJobFirstNode}}" | grep "Address 1" | awk 'NR==2 {print $3}') || true + ip=$(nslookup "$SLURM_JOB_FIRST_NODE" | grep "Address 1" | awk 'NR==2 {print $3}') || true if [[ -n "$ip" ]]; then SLURM_JOB_FIRST_NODE_IP=$ip break @@ -37,39 +37,19 @@ do current_time=$(date +%s) elapsed_time=$((current_time - start_time)) if [ "$elapsed_time" -ge "$timeout" ]; then - echo "Timeout reached, IP address for the first node ({{.SlurmJobFirstNode}}) not found." + echo "Timeout reached, IP address for the first node ($SLURM_JOB_FIRST_NODE) not found." break fi - echo "IP Address for the first node ({{.SlurmJobFirstNode}}) not found, retrying..." 
+ echo "IP Address for the first node ($SLURM_JOB_FIRST_NODE) not found, retrying..." sleep 1 fi done fi {{end}} cat << EOF > {{.EnvsPath}}/$i/{{.SlurmEnvFilename}} -SLURM_ARRAY_JOB_ID={{.SlurmArrayJobID}} -SLURM_ARRAY_TASK_COUNT={{.SlurmArrayTaskCount}} -SLURM_ARRAY_TASK_MAX={{.SlurmArrayTaskMax}} -SLURM_ARRAY_TASK_MIN={{.SlurmArrayTaskMin}} -SLURM_TASKS_PER_NODE={{.SlurmTasksPerNode}} -SLURM_CPUS_PER_TASK={{.SlurmCPUsPerTask}} -SLURM_CPUS_ON_NODE={{.SlurmCPUsOnNode}} -SLURM_JOB_CPUS_PER_NODE={{.SlurmJobCPUsPerNode}} -SLURM_CPUS_PER_GPU={{.SlurmCPUsPerGPU}} -SLURM_MEM_PER_CPU={{.SlurmMemPerCPU}} -SLURM_MEM_PER_GPU={{.SlurmMemPerGPU}} -SLURM_MEM_PER_NODE={{.SlurmMemPerNode}} -SLURM_GPUS={{.SlurmGPUs}} -SLURM_NTASKS={{.SlurmNTasks}} -SLURM_NTASKS_PER_NODE={{.SlurmNTasksPerNode}} -SLURM_NPROCS={{.SlurmNProcs}} -SLURM_NNODES={{.SlurmNNodes}} -SLURM_SUBMIT_DIR={{.SlurmSubmitDir}} SLURM_SUBMIT_HOST=$HOSTNAME -SLURM_JOB_NODELIST={{.SlurmJobNodeList}} -SLURM_JOB_FIRST_NODE={{.SlurmJobFirstNode}} -SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* {{.SlurmNTasksPerNode}} + $i + 1) -SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* {{.SlurmNTasksPerNode}} + $i + 1) +SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* $SLURM_TASKS_PER_NODE + $i + $SLURM_ARRAY_JOB_ID) +SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* $SLURM_TASKS_PER_NODE + $i + $SLURM_ARRAY_JOB_ID) SLURM_ARRAY_TASK_ID=$container_index SLURM_JOB_FIRST_NODE_IP=${SLURM_JOB_FIRST_NODE_IP:-""} EOF diff --git a/cmd/experimental/kjobctl/pkg/cmd/create/create_test.go b/cmd/experimental/kjobctl/pkg/cmd/create/create_test.go index 136935f278..ecc644a59b 100644 --- a/cmd/experimental/kjobctl/pkg/cmd/create/create_test.go +++ b/cmd/experimental/kjobctl/pkg/cmd/create/create_test.go @@ -933,11 +933,18 @@ func TestCreateCmd(t *testing.T) { WithVolumeMount(corev1.VolumeMount{Name: "slurm-scripts", MountPath: "/slurm/scripts"}). WithVolumeMount(corev1.VolumeMount{Name: "slurm-env", MountPath: "/slurm/env"}). Obj()). 
+ WithEnvFrom(corev1.EnvFromSource{ + ConfigMapRef: &corev1.ConfigMapEnvSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: "profile-slurm-c7rb4-env", + }, + }, + }). WithVolume(corev1.Volume{ Name: "slurm-scripts", VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{Name: "profile-slurm"}, + LocalObjectReference: corev1.LocalObjectReference{Name: "profile-slurm-c7rb4-scripts"}, Items: []corev1.KeyToPath{ {Key: "init-entrypoint.sh", Path: "init-entrypoint.sh"}, {Key: "entrypoint.sh", Path: "entrypoint.sh"}, @@ -967,9 +974,9 @@ func TestCreateCmd(t *testing.T) { &corev1.ConfigMapList{ TypeMeta: metav1.TypeMeta{Kind: "ConfigMapList", APIVersion: "v1"}, Items: []corev1.ConfigMap{ - *wrappers.MakeConfigMap("profile-slurm", metav1.NamespaceDefault). + *wrappers.MakeConfigMap("profile-slurm-c7rb4-scripts", metav1.NamespaceDefault). WithOwnerReference(metav1.OwnerReference{ - Name: "profile-slurm", + Name: "profile-slurm-c7rb4", APIVersion: "batch/v1", Kind: "Job", }). 
@@ -991,7 +998,7 @@ set -x array_indexes="0" container_indexes=$(echo "$array_indexes" | awk -F';' -v idx="$JOB_COMPLETION_INDEX" '{print $((idx + 1))}') -for i in $(seq 0 1) +for i in $(seq 0 $SLURM_TASKS_PER_NODE) do container_index=$(echo "$container_indexes" | awk -F',' -v idx="$i" '{print $((idx + 1))}') @@ -1003,29 +1010,9 @@ do cat << EOF > /slurm/env/$i/slurm.env -SLURM_ARRAY_JOB_ID=1 -SLURM_ARRAY_TASK_COUNT=1 -SLURM_ARRAY_TASK_MAX=0 -SLURM_ARRAY_TASK_MIN=0 -SLURM_TASKS_PER_NODE=1 -SLURM_CPUS_PER_TASK= -SLURM_CPUS_ON_NODE= -SLURM_JOB_CPUS_PER_NODE= -SLURM_CPUS_PER_GPU= -SLURM_MEM_PER_CPU= -SLURM_MEM_PER_GPU= -SLURM_MEM_PER_NODE= -SLURM_GPUS= -SLURM_NTASKS=1 -SLURM_NTASKS_PER_NODE=1 -SLURM_NPROCS=1 -SLURM_NNODES=1 -SLURM_SUBMIT_DIR=/slurm/scripts SLURM_SUBMIT_HOST=$HOSTNAME -SLURM_JOB_NODELIST=profile-slurm-0.profile-slurm -SLURM_JOB_FIRST_NODE=profile-slurm-0.profile-slurm -SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* 1 + $i + 1) -SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* 1 + $i + 1) +SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* $SLURM_TASKS_PER_NODE + $i + $SLURM_ARRAY_JOB_ID) +SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* $SLURM_TASKS_PER_NODE + $i + $SLURM_ARRAY_JOB_ID) SLURM_ARRAY_TASK_ID=$container_index SLURM_JOB_FIRST_NODE_IP=${SLURM_JOB_FIRST_NODE_IP:-""} EOF @@ -1053,18 +1040,49 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs) `, }). Obj(), + *wrappers.MakeConfigMap("profile-slurm-c7rb4-env", metav1.NamespaceDefault). + WithOwnerReference(metav1.OwnerReference{ + Name: "profile-slurm-c7rb4", + APIVersion: "batch/v1", + Kind: "Job", + }). + Profile("profile"). + Mode(v1alpha1.SlurmMode). 
+ Data(map[string]string{ + "SLURM_ARRAY_JOB_ID": "1", + "SLURM_ARRAY_TASK_COUNT": "1", + "SLURM_ARRAY_TASK_MAX": "0", + "SLURM_ARRAY_TASK_MIN": "0", + "SLURM_TASKS_PER_NODE": "1", + "SLURM_CPUS_PER_TASK": "", + "SLURM_CPUS_ON_NODE": "", + "SLURM_JOB_CPUS_PER_NODE": "", + "SLURM_CPUS_PER_GPU": "", + "SLURM_MEM_PER_CPU": "", + "SLURM_MEM_PER_GPU": "", + "SLURM_MEM_PER_NODE": "", + "SLURM_GPUS": "", + "SLURM_NTASKS": "1", + "SLURM_NTASKS_PER_NODE": "1", + "SLURM_NPROCS": "1", + "SLURM_NNODES": "1", + "SLURM_SUBMIT_DIR": "/slurm/scripts", + "SLURM_JOB_NODELIST": "profile-slurm-mwnv4-0.profile-slurm-mwnv4", + "SLURM_JOB_FIRST_NODE": "profile-slurm-mwnv4-0.profile-slurm-mwnv4", + }). + Obj(), }, }, &corev1.ServiceList{ TypeMeta: metav1.TypeMeta{Kind: "ServiceList", APIVersion: "v1"}, Items: []corev1.Service{ - *wrappers.MakeService("profile-slurm", metav1.NamespaceDefault). + *wrappers.MakeService("profile-slurm-c7rb4", metav1.NamespaceDefault). Profile("profile"). Mode(v1alpha1.SlurmMode). ClusterIP("None"). - Selector("job-name", "profile-slurm"). + Selector("job-name", "profile-slurm-c7rb4"). WithOwnerReference(metav1.OwnerReference{ - Name: "profile-slurm", + Name: "profile-slurm-c7rb4", APIVersion: "batch/v1", Kind: "Job", }). @@ -1082,8 +1100,11 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs) } return m }), + cmpopts.SortSlices(func(a, b corev1.ConfigMap) bool { + return a.Name < b.Name + }), }, - wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`, + wantOutPattern: `^job\.batch\/.+ created\\nconfigmap\/.+ created\\nconfigmap\/.+ created\\nservice\/.+ created\\n`, }, "should create slurm with flags": { beforeTest: beforeSlurmTest, @@ -1176,11 +1197,18 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs) WithVolumeMount(corev1.VolumeMount{Name: "slurm-env", MountPath: "/slurm/env"}). WithRequest(corev1.ResourceCPU, resource.MustParse("2")). Obj()). 
+ WithEnvFrom(corev1.EnvFromSource{ + ConfigMapRef: &corev1.ConfigMapEnvSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: "profile-slurm-c7rb4-env", + }, + }, + }). WithVolume(corev1.Volume{ Name: "slurm-scripts", VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{Name: "profile-slurm"}, + LocalObjectReference: corev1.LocalObjectReference{Name: "profile-slurm-c7rb4-scripts"}, Items: []corev1.KeyToPath{ {Key: "init-entrypoint.sh", Path: "init-entrypoint.sh"}, {Key: "entrypoint.sh", Path: "entrypoint.sh"}, @@ -1210,15 +1238,14 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs) &corev1.ConfigMapList{ TypeMeta: metav1.TypeMeta{Kind: "ConfigMapList", APIVersion: "v1"}, Items: []corev1.ConfigMap{ - *wrappers.MakeConfigMap("profile-slurm", metav1.NamespaceDefault). + *wrappers.MakeConfigMap("profile-slurm-c7rb4-scripts", metav1.NamespaceDefault). WithOwnerReference(metav1.OwnerReference{ - Name: "profile-slurm", + Name: "profile-slurm-c7rb4", APIVersion: "batch/v1", Kind: "Job", }). Profile("profile"). Mode(v1alpha1.SlurmMode). - LocalQueue("lq1"). 
Data(map[string]string{ "script": "#!/bin/bash\nsleep 300'", "init-entrypoint.sh": `#!/bin/sh @@ -1235,7 +1262,7 @@ set -x array_indexes="0,1,2;3,4,5;6,7,8;9,10,11;12,13,14;15,16,17;18,19,20;21,22,23;24,25" container_indexes=$(echo "$array_indexes" | awk -F';' -v idx="$JOB_COMPLETION_INDEX" '{print $((idx + 1))}') -for i in $(seq 0 3) +for i in $(seq 0 $SLURM_TASKS_PER_NODE) do container_index=$(echo "$container_indexes" | awk -F',' -v idx="$i" '{print $((idx + 1))}') @@ -1252,7 +1279,7 @@ do timeout=29 start_time=$(date +%s) while true; do - ip=$(nslookup "profile-slurm-r8njg-0.profile-slurm-r8njg" | grep "Address 1" | awk 'NR==2 {print $3}') || true + ip=$(nslookup "$SLURM_JOB_FIRST_NODE" | grep "Address 1" | awk 'NR==2 {print $3}') || true if [[ -n "$ip" ]]; then SLURM_JOB_FIRST_NODE_IP=$ip break @@ -1260,39 +1287,19 @@ do current_time=$(date +%s) elapsed_time=$((current_time - start_time)) if [ "$elapsed_time" -ge "$timeout" ]; then - echo "Timeout reached, IP address for the first node (profile-slurm-r8njg-0.profile-slurm-r8njg) not found." + echo "Timeout reached, IP address for the first node ($SLURM_JOB_FIRST_NODE) not found." break fi - echo "IP Address for the first node (profile-slurm-r8njg-0.profile-slurm-r8njg) not found, retrying..." + echo "IP Address for the first node ($SLURM_JOB_FIRST_NODE) not found, retrying..." 
sleep 1 fi done fi cat << EOF > /slurm/env/$i/slurm.env -SLURM_ARRAY_JOB_ID=1 -SLURM_ARRAY_TASK_COUNT=26 -SLURM_ARRAY_TASK_MAX=25 -SLURM_ARRAY_TASK_MIN=0 -SLURM_TASKS_PER_NODE=3 -SLURM_CPUS_PER_TASK=2 -SLURM_CPUS_ON_NODE=8 -SLURM_JOB_CPUS_PER_NODE=8 -SLURM_CPUS_PER_GPU= -SLURM_MEM_PER_CPU= -SLURM_MEM_PER_GPU= -SLURM_MEM_PER_NODE= -SLURM_GPUS= -SLURM_NTASKS=3 -SLURM_NTASKS_PER_NODE=3 -SLURM_NPROCS=3 -SLURM_NNODES=2 -SLURM_SUBMIT_DIR=/slurm/scripts SLURM_SUBMIT_HOST=$HOSTNAME -SLURM_JOB_NODELIST=profile-slurm-fpxnj-0.profile-slurm-fpxnj,profile-slurm-fpxnj-1.profile-slurm-fpxnj -SLURM_JOB_FIRST_NODE=profile-slurm-fpxnj-0.profile-slurm-fpxnj -SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* 3 + $i + 1) -SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* 3 + $i + 1) +SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* $SLURM_TASKS_PER_NODE + $i + $SLURM_ARRAY_JOB_ID) +SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* $SLURM_TASKS_PER_NODE + $i + $SLURM_ARRAY_JOB_ID) SLURM_ARRAY_TASK_ID=$container_index SLURM_JOB_FIRST_NODE_IP=${SLURM_JOB_FIRST_NODE_IP:-""} EOF @@ -1320,6 +1327,37 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs)cd /mydir `, }). Obj(), + *wrappers.MakeConfigMap("profile-slurm-c7rb4-env", metav1.NamespaceDefault). + WithOwnerReference(metav1.OwnerReference{ + Name: "profile-slurm-c7rb4", + APIVersion: "batch/v1", + Kind: "Job", + }). + Profile("profile"). + Mode(v1alpha1.SlurmMode). 
+ Data(map[string]string{ + "SLURM_ARRAY_JOB_ID": "1", + "SLURM_ARRAY_TASK_COUNT": "26", + "SLURM_ARRAY_TASK_MAX": "25", + "SLURM_ARRAY_TASK_MIN": "0", + "SLURM_TASKS_PER_NODE": "3", + "SLURM_CPUS_PER_TASK": "2", + "SLURM_CPUS_ON_NODE": "8", + "SLURM_JOB_CPUS_PER_NODE": "8", + "SLURM_CPUS_PER_GPU": "", + "SLURM_MEM_PER_CPU": "", + "SLURM_MEM_PER_GPU": "", + "SLURM_MEM_PER_NODE": "", + "SLURM_GPUS": "", + "SLURM_NTASKS": "3", + "SLURM_NTASKS_PER_NODE": "3", + "SLURM_NPROCS": "3", + "SLURM_NNODES": "2", + "SLURM_SUBMIT_DIR": "/slurm/scripts", + "SLURM_JOB_NODELIST": "profile-slurm-mwnv4-0.profile-slurm-mwnv4,profile-slurm-mwnv4-1.profile-slurm-mwnv4", + "SLURM_JOB_FIRST_NODE": "profile-slurm-mwnv4-0.profile-slurm-mwnv4", + }). + Obj(), }, }, &corev1.ServiceList{ @@ -1328,7 +1366,6 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs)cd /mydir *wrappers.MakeService("profile-slurm", metav1.NamespaceDefault). Profile("profile"). Mode(v1alpha1.SlurmMode). - LocalQueue("lq1"). ClusterIP("None"). Selector("job-name", "profile-slurm"). 
WithOwnerReference(metav1.OwnerReference{ @@ -1350,8 +1387,11 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs)cd /mydir } return m }), + cmpopts.SortSlices(func(a, b corev1.ConfigMap) bool { + return a.Name < b.Name + }), }, - wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`, + wantOutPattern: `^job\.batch\/.+ created\\nconfigmap\/.+ created\\nconfigmap\/.+ created\\nservice\/.+ created\\n`, }, "should create slurm with --ntasks flag": { beforeTest: beforeSlurmTest, @@ -1409,12 +1449,13 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs)cd /mydir cmpopts.IgnoreFields(metav1.OwnerReference{}, "Name"), cmpopts.IgnoreFields(corev1.PodSpec{}, "InitContainers", "Subdomain"), cmpopts.IgnoreTypes([]corev1.EnvVar{}), + cmpopts.IgnoreTypes([]corev1.EnvFromSource{}), cmpopts.IgnoreTypes([]corev1.Volume{}), cmpopts.IgnoreTypes([]corev1.VolumeMount{}), cmpopts.IgnoreTypes(corev1.ConfigMapList{}), cmpopts.IgnoreTypes(corev1.ServiceList{}), }, - wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`, + wantOutPattern: `^job\.batch\/.+ created\\nconfigmap\/.+ created\\nconfigmap\/.+ created\\nservice\/.+ created\\n`, }, "should divide --mem exactly across containers": { beforeTest: beforeSlurmTest, @@ -1479,12 +1520,13 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs)cd /mydir cmpopts.IgnoreFields(metav1.OwnerReference{}, "Name"), cmpopts.IgnoreFields(corev1.PodSpec{}, "InitContainers", "Subdomain"), cmpopts.IgnoreTypes([]corev1.EnvVar{}), + cmpopts.IgnoreTypes([]corev1.EnvFromSource{}), cmpopts.IgnoreTypes([]corev1.Volume{}), cmpopts.IgnoreTypes([]corev1.VolumeMount{}), cmpopts.IgnoreTypes(corev1.ConfigMapList{}), cmpopts.IgnoreTypes(corev1.ServiceList{}), }, - wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`, + wantOutPattern: `^job\.batch\/.+ created\\nconfigmap\/.+ created\\nconfigmap\/.+ created\\nservice\/.+ created\\n`, }, "should handle non-exact --mem division across containers": 
{ beforeTest: beforeSlurmTest, @@ -1549,12 +1591,13 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs)cd /mydir cmpopts.IgnoreFields(metav1.OwnerReference{}, "Name"), cmpopts.IgnoreFields(corev1.PodSpec{}, "InitContainers", "Subdomain"), cmpopts.IgnoreTypes([]corev1.EnvVar{}), + cmpopts.IgnoreTypes([]corev1.EnvFromSource{}), cmpopts.IgnoreTypes([]corev1.Volume{}), cmpopts.IgnoreTypes([]corev1.VolumeMount{}), cmpopts.IgnoreTypes(corev1.ConfigMapList{}), cmpopts.IgnoreTypes(corev1.ServiceList{}), }, - wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`, + wantOutPattern: `^job\.batch\/.+ created\\nconfigmap\/.+ created\\nconfigmap\/.+ created\\nservice\/.+ created\\n`, }, "should create slurm with --mem-per-cpu flag": { beforeTest: beforeSlurmTest, @@ -1622,12 +1665,13 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs)cd /mydir cmpopts.IgnoreFields(metav1.OwnerReference{}, "Name"), cmpopts.IgnoreFields(corev1.PodSpec{}, "InitContainers", "Subdomain"), cmpopts.IgnoreTypes([]corev1.EnvVar{}), + cmpopts.IgnoreTypes([]corev1.EnvFromSource{}), cmpopts.IgnoreTypes([]corev1.Volume{}), cmpopts.IgnoreTypes([]corev1.VolumeMount{}), cmpopts.IgnoreTypes(corev1.ConfigMapList{}), cmpopts.IgnoreTypes(corev1.ServiceList{}), }, - wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`, + wantOutPattern: `^job\.batch\/.+ created\\nconfigmap\/.+ created\\nconfigmap\/.+ created\\nservice\/.+ created\\n`, }, "shouldn't create slurm with --mem-per-cpu flag because --cpus-per-task flag not specified": { beforeTest: beforeSlurmTest, @@ -1709,12 +1753,13 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs)cd /mydir cmpopts.IgnoreFields(metav1.OwnerReference{}, "Name"), cmpopts.IgnoreFields(corev1.PodSpec{}, "InitContainers", "Subdomain"), cmpopts.IgnoreTypes([]corev1.EnvVar{}), + cmpopts.IgnoreTypes([]corev1.EnvFromSource{}), cmpopts.IgnoreTypes([]corev1.Volume{}), cmpopts.IgnoreTypes([]corev1.VolumeMount{}), 
cmpopts.IgnoreTypes(corev1.ConfigMapList{}), cmpopts.IgnoreTypes(corev1.ServiceList{}), }, - wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`, + wantOutPattern: `^job\.batch\/.+ created\\nconfigmap\/.+ created\\nconfigmap\/.+ created\\nservice\/.+ created\\n`, }, "should create slurm with --mem-per-gpu flag": { beforeTest: beforeSlurmTest, @@ -1784,12 +1829,13 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs)cd /mydir cmpopts.IgnoreFields(metav1.OwnerReference{}, "Name"), cmpopts.IgnoreFields(corev1.PodSpec{}, "InitContainers", "Subdomain"), cmpopts.IgnoreTypes([]corev1.EnvVar{}), + cmpopts.IgnoreTypes([]corev1.EnvFromSource{}), cmpopts.IgnoreTypes([]corev1.Volume{}), cmpopts.IgnoreTypes([]corev1.VolumeMount{}), cmpopts.IgnoreTypes(corev1.ConfigMapList{}), cmpopts.IgnoreTypes(corev1.ServiceList{}), }, - wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`, + wantOutPattern: `^job\.batch\/.+ created\\nconfigmap\/.+ created\\nconfigmap\/.+ created\\nservice\/.+ created\\n`, }, "shouldn't create slurm with --mem-per-gpu flag because --gpus-per-task flag not specified": { beforeTest: beforeSlurmTest, @@ -1866,12 +1912,13 @@ export $(cat /slurm/env/$JOB_CONTAINER_INDEX/slurm.env | xargs)cd /mydir cmpopts.IgnoreFields(metav1.OwnerReference{}, "Name"), cmpopts.IgnoreFields(corev1.PodSpec{}, "InitContainers", "Subdomain"), cmpopts.IgnoreTypes([]corev1.EnvVar{}), + cmpopts.IgnoreTypes([]corev1.EnvFromSource{}), cmpopts.IgnoreTypes([]corev1.Volume{}), cmpopts.IgnoreTypes([]corev1.VolumeMount{}), cmpopts.IgnoreTypes(corev1.ConfigMapList{}), cmpopts.IgnoreTypes(corev1.ServiceList{}), }, - wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`, + wantOutPattern: `^job\.batch\/.+ created\\nconfigmap\/.+ created\\nconfigmap\/.+ created\\nservice\/.+ created\\n`, }, "shouldn't create job with client dry run": { args: func(tc *createCmdTestCase) []string { diff --git 
a/cmd/experimental/kjobctl/pkg/testing/wrappers/job_wrappers.go b/cmd/experimental/kjobctl/pkg/testing/wrappers/job_wrappers.go index 9c36b95145..9ef94b2791 100644 --- a/cmd/experimental/kjobctl/pkg/testing/wrappers/job_wrappers.go +++ b/cmd/experimental/kjobctl/pkg/testing/wrappers/job_wrappers.go @@ -155,6 +155,17 @@ func (j *JobWrapper) WithEnvVarIndexValue(name string) *JobWrapper { return j } +// WithEnvFrom appends envFrom to every container and init container in the pod template. +func (j *JobWrapper) WithEnvFrom(envFrom corev1.EnvFromSource) *JobWrapper { + for index := range j.Job.Spec.Template.Spec.Containers { + j.Job.Spec.Template.Spec.Containers[index].EnvFrom = append(j.Job.Spec.Template.Spec.Containers[index].EnvFrom, envFrom) + } + for index := range j.Job.Spec.Template.Spec.InitContainers { + j.Job.Spec.Template.Spec.InitContainers[index].EnvFrom = append(j.Job.Spec.Template.Spec.InitContainers[index].EnvFrom, envFrom) + } + return j +} + // RestartPolicy updates the restartPolicy on the pod template. func (j *JobWrapper) RestartPolicy(restartPolicy corev1.RestartPolicy) *JobWrapper { j.Job.Spec.Template.Spec.RestartPolicy = restartPolicy