Skip to content

Commit 16df822

Browse files
committed
Ready to test job_srun() (ref #194)
1 parent ffaab7d commit 16df822

File tree

3 files changed

+114
-19
lines changed

3 files changed

+114
-19
lines changed

turbine/code/export/job.swift

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
// Various system-level configurations for application jobs.
44

55
@par @dispatch=WORKER
6-
(int status) job_srun(int cores_per_job, int procs_per_job,
7-
string cmd_line[])
6+
(int status) job_srun(int cores_per_node, int cores_per_job, int procs_per_job,
7+
boolean bind, string cmd_line[])
88
"turbine" "0.0" "job_srun_tcl";

turbine/code/lib/functions.tcl

+29-1
Original file line numberDiff line numberDiff line change
@@ -592,9 +592,37 @@ namespace eval turbine {
592592
}
593593
return $result
594594
}
595+
596+
# Build an arithmetic sequence of integers.
#   start: first value of the sequence
#   count: number of values to generate; nothing is generated
#          (and start/step are never touched by incr) when count <= 0
#   step:  difference between consecutive values (default 1)
# Returns the list { start start+step start+2*step ... }.
# Values are advanced with incr, so start and step must be integers
# whenever at least one value is generated.
proc contig { start count { step 1 } } {
  set values [ list ]
  set current $start
  set produced 0
  while { $produced < $count } {
    lappend values $current
    incr current $step
    incr produced
  }
  return $values
}
605+
606+
# Break list L into count equal-size chunks of consecutive elements.
#   L:     the list to split
#   count: number of chunks to produce; must be an integer >= 1
# Returns a list of exactly count sub-lists, each of size
# s = [llength $L] / count (integer division).
# NOTE: when the length of L is not a multiple of count, the trailing
# ([llength $L] % count) elements are not placed in any chunk.
proc fragment { L count } {
  # Fail with a readable message instead of a bare divide-by-zero
  if { ! [ string is integer -strict $count ] || $count < 1 } {
    error "fragment: count must be a positive integer, got: '$count'"
  }
  set n [ llength $L ]
  # Chunk size (integer division; braced so operands are substituted once)
  set s [ expr { $n / $count } ]
  set result [ list ]
  for { set c 0 } { $c < $count } { incr c } {
    set first [ expr { $c * $s } ]
    # lrange yields an empty chunk when s == 0 (last index < first index)
    lappend result [ lrange $L $first [ expr { $first + $s - 1 } ] ]
  }
  return $result
}
595623
}
596624

597625
# Local Variables:
598626
# mode: tcl
599-
# tcl-indent-level: 4
627+
# tcl-indent-level: 2
600628
# End:

turbine/code/lib/job.tcl

+83-16
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,22 @@ namespace eval turbine {
66

77
# Entry point named by the Swift/T job_srun() binding ("job_srun_tcl").
#   outputs: list whose first element is the exit-code output datum
#   inputs:  list of five input data, in order:
#            cores per node, cores per job, processes per job,
#            bind flag, command-line container
# Registers a rule on all inputs that invokes
# turbine::job_srun_tcl_body as a task of type $turbine::WORK.
proc job_srun_tcl { outputs inputs } {
  set exit_code [ lindex $outputs 0 ]
  # Unpack the five inputs positionally (missing ones become "")
  lassign $inputs cpn cpj ppj bind cmd_line
  set action \
    "turbine::job_srun_tcl_body $exit_code $cpn $cpj $ppj $bind $cmd_line"
  rule $inputs $action type $turbine::WORK
}
1518

16-
proc job_srun_tcl_body { exit_code cpj ppj cmd_line } {
19+
proc job_srun_tcl_body { exit_code cpn cpj ppj bind cmd_line } {
1720
# Retrieve data (decr?)
18-
set cpj_value [ retrieve_integer $cpj ]
19-
set ppj_value [ retrieve_integer $ppj ]
21+
set cpn_value [ retrieve_integer $cpn ]
22+
set cpj_value [ retrieve_integer $cpj ]
23+
set ppj_value [ retrieve_integer $ppj ]
24+
set bind_value [ retrieve_integer $bind ]
2025
# Unpack command line
2126
set D [ adlb::enumerate $cmd_line dict all 0 ]
2227
set cmd_value [ list ]
@@ -25,27 +30,89 @@ namespace eval turbine {
2530
lappend cmd_value [ dict get $D $k ]
2631
}
2732
# Run the user code
28-
set exit_code_value [ job_srun_impl $cpj_value $ppj_value $cmd_value ]
33+
set exit_code_value \
34+
[ job_srun_impl $cpn_value $cpj_value $ppj_value $bind_value $cmd_value ]
2935
# Store result
3036
store_integer $exit_code $exit_code_value
3137
}
3238

33-
proc job_srun_impl { cpj ppj cmd } {
39+
# Set up and run the job under srun, echoing its output.
#   cpn:  cores per node    (only used when bind is true)
#   cpj:  cores per job     (only used when bind is true)
#   ppj:  processes per job, passed to srun as -n
#   bind: boolean; when true, a --cpu-bind option is generated
#         by bind_mask_cpu and passed to srun
#   cmd:  the user command line, as a Tcl list of words
# Returns 0 if the srun pipeline ran and closed without error,
# 1 otherwise (the error is reported through job_srun_error).
proc job_srun_impl { cpn cpj ppj bind cmd } {
  global env
  puts "turbine: srun: job_srun ..."

  # Braced so that the condition is substituted exactly once
  if { $bind } {
    set cpu_bind [ bind_mask_cpu $cpn $cpj $ppj ]
  } else {
    set cpu_bind ""
  }

  puts "turbine: srun: job_srun -n $ppj -N 1 $cpu_bind $cmd"
  puts "turbine: srun: in PWD: $env(PWD)"
  try {
    # Build the pipeline as a proper list so each word reaches srun
    # intact; an empty cpu_bind expands to no argument at all.
    # 2>@1 merges the job's stderr into the captured output.
    set pipeline [ list srun -n $ppj -N 1 {*}$cpu_bind {*}$cmd 2>@1 ]
    # Run the user job! (with pipe to capture output)
    set fp [ open "|$pipeline" "r" ]
    while { [ gets $fp line ] >= 0 } {
      puts "srun: $line"
    }
    # close raises an error here if the pipeline failed
    close $fp
  } on error e {
    # If the failure happened before close, do not leak the channel;
    # if close itself failed, this second close is a harmless no-op.
    if { [ info exists fp ] } {
      catch { close $fp }
    }
    job_srun_error $e
    return 1
  }
  return 0
}
65+
66+
# Set up the SLURM cpu binding option for one job.
#   cpn: cores per node
#   cpj: cores per job
#   ppj: processes per job
# Reads env(ADLB_RANK_OFFSET) and env(PPN); both must be set.
# Returns the string "--cpu-bind=verbose,mask_cpu:" followed by one
# hexadecimal mask per process, comma-separated.
# The show calls emit debugging output via a helper defined elsewhere
# (presumably it prints each named variable -- confirm).
proc bind_mask_cpu { cpn cpj ppj } {
  global env
  set cpu_bind "--cpu-bind=verbose,mask_cpu:"
  set offset $env(ADLB_RANK_OFFSET)
  set ppn $env(PPN)
  show offset ppn cpn cpj ppj

  # Number of cores available to each of the ppn slots on the node
  set cpj_max [ expr { $cpn / $ppn } ]
  show cpj_max
  # First core of the slot selected by this rank offset
  set start [ expr { $cpj_max * $offset } ]
  # Spread the cpj cores of the job across the slot
  # NOTE(review): step is 0 when cpj > cpj_max, which repeats the
  # same core id cpj times -- confirm callers never request that.
  set step [ expr { $cpj_max / $cpj } ]
  set S2 [ contig $start $cpj $step ]
  show step S2
  # One group of core ids per process
  set K [ fragment $S2 $ppj ]
  show K

  # Convert each group of core ids into a hexadecimal bit mask
  set masks [ list ]
  foreach chunk $K {
    set mask [ list2mask $chunk ]
    show mask
    lappend masks $mask
  }
  show masks
  append cpu_bind [ join $masks "," ]
  return $cpu_bind
}
101+
102+
# Report a failed srun invocation on stdout.
#   e: the error message to print, framed by begin/end marker lines
proc job_srun_error { e } {
  set report [ list \
                 "turbine: srun failed!" \
                 "turbine: srun error message begin:" \
                 $e \
                 "turbine: srun error message end." ]
  foreach text $report {
    puts $text
  }
}
108+
109+
# Convert a list of bit positions (CPU ids) into a hexadecimal mask.
#   L: list of non-negative integers; bit i of the mask is set for
#      every i in L (a repeated id sets its bit once)
# Prints the mask in decimal on stdout (existing debugging output),
# then returns it formatted as "0x" followed by upper-case hex digits.
proc list2mask { L } {
  set A 0
  foreach i $L {
    # Bitwise OR, not addition: a duplicate id must not carry
    # into the neighbouring bit
    set A [ expr { $A | (2 ** $i) } ]
  }
  puts $A
  # The ll size modifier formats the value without truncation;
  # plain %X would cut masks for ids >= 64 down to machine-int range
  return [ format "0x%llX" $A ]
}
51118
}

0 commit comments

Comments
 (0)