@@ -6,17 +6,22 @@ namespace eval turbine {
6
6
7
7
proc job_srun_tcl { outputs inputs } {
    # Dataflow entry point: register a rule that fires once all of
    # $inputs are available and then evaluates job_srun_tcl_body
    # as a task of type $turbine::WORK.
    #   outputs: list whose first element is the exit code datum
    #   inputs:  list of five data, in order: cpn cpj ppj bind cmd_line
    set exit_code [ lindex $outputs 0 ]
    # Missing trailing elements become empty strings, as with lindex
    lassign $inputs cpn cpj ppj bind cmd_line
    set action " turbine::job_srun_tcl_body $exit_code $cpn $cpj $ppj $bind $cmd_line "
    rule $inputs $action type $turbine::WORK
}
15
18
16
- proc job_srun_tcl_body { exit_code cpj ppj cmd_line } {
19
+ proc job_srun_tcl_body { exit_code cpn cpj ppj bind cmd_line } {
17
20
# Retrieve data (decr?)
18
- set cpj_value [ retrieve_integer $cpj ]
19
- set ppj_value [ retrieve_integer $ppj ]
21
+ set cpn_value [ retrieve_integer $cpn ]
22
+ set cpj_value [ retrieve_integer $cpj ]
23
+ set ppj_value [ retrieve_integer $ppj ]
24
+ set bind_value [ retrieve_integer $bind ]
20
25
# Unpack command line
21
26
set D [ adlb::enumerate $cmd_line dict all 0 ]
22
27
set cmd_value [ list ]
@@ -25,27 +30,89 @@ namespace eval turbine {
25
30
lappend cmd_value [ dict get $D $k ]
26
31
}
27
32
# Run the user code
28
- set exit_code_value [ job_srun_impl $cpj_value $ppj_value $cmd_value ]
33
+ set exit_code_value \
34
+ [ job_srun_impl $cpn_value $cpj_value $ppj_value $bind_value $cmd_value ]
29
35
# Store result
30
36
store_integer $exit_code $exit_code_value
31
37
}
32
38
33
proc job_srun_impl { cpn cpj ppj bind cmd } {
    # Set up and run one job via srun on a single node (-N 1),
    # echoing the job's combined stdout/stderr line by line.
    #   cpn, cpj: integers forwarded to bind_mask_cpu when binding
    #   ppj:      number of tasks given to srun via -n
    #   bind:     boolean; when true, a CPU binding option is generated
    #             by bind_mask_cpu and passed to srun
    #   cmd:      Tcl list holding the user program and its arguments
    # Returns a unix exit code: 0 on success, 1 if the pipeline could
    # not be opened, reading failed, or close reported a failure
    # (e.g. srun exited non-zero).
    global env
    puts " turbine: srun: job_srun ..."

    # Braced condition: avoids double substitution of $bind
    if { $bind } {
        set cpu_bind [ bind_mask_cpu $cpn $cpj $ppj ]
    } else {
        set cpu_bind " "
    }

    puts " turbine: srun: job_srun -n $ppj -N 1 $cpu_bind $cmd "
    # PWD may be absent from the environment: fall back to [pwd]
    if { [ info exists env(PWD) ] } {
        set pwd_value $env(PWD)
    } else {
        set pwd_value [ pwd ]
    }
    puts " turbine: srun: in PWD: $pwd_value "

    # Build the pipeline as a list so that the channel name begins
    # with "|" (a leading blank would make open treat it as a file
    # name) and so that each word of the binding option and of the
    # user command stays a separate argument.
    # 2>@1 merges the job's stderr into the captured output.
    set pipeline [ list srun -n $ppj -N 1 {*}$cpu_bind {*}$cmd 2>@1 ]

    try {
        set fp [ open |$pipeline r ]
    } on error e {
        job_srun_error $e
        return 1
    }

    set status 0
    try {
        while { [ gets $fp line ] >= 0 } {
            puts " srun: $line "
        }
    } on error e {
        job_srun_error $e
        set status 1
    } finally {
        # Always close the channel so it cannot leak.
        # close raises an error if the child process failed.
        if { [ catch { close $fp } e ] } {
            job_srun_error $e
            set status 1
        }
    }
    return $status
}
65
+
66
proc bind_mask_cpu { cpn cpj ppj } {
    # Set up the SLURM cpu binding option for one job.
    #   cpn, cpj, ppj: positive integers
    #     (presumably cores per node, cores per job and processes
    #      per job -- TODO confirm against the caller)
    # Reads env(ADLB_RANK_OFFSET) and env(PPN); both must be set.
    # Returns " --cpu-bind=verbose,mask_cpu:M1,M2,..." with one hex
    # mask per chunk produced by fragment.  The leading blank is kept
    # for callers that splice the result into a command string.
    global env
    set offset $env(ADLB_RANK_OFFSET)
    set ppn    $env(PPN)
    show offset ppn cpn cpj ppj

    # These values are used as divisors or chunk counts below
    if { $cpj <= 0 || $ppj <= 0 || $ppn <= 0 } {
        error "bind_mask_cpu: cpj, ppj and PPN must be positive:\
               cpj=$cpj ppj=$ppj PPN=$ppn"
    }

    # CPUs available to each worker on the node
    set cpj_max [ expr { $cpn / $ppn } ]
    show cpj_max
    # First CPU id of this worker's share
    set start [ expr { $cpj_max * $offset } ]
    # Stride between the CPU ids handed to this job
    set step [ expr { $cpj_max / $cpj } ]
    # NOTE(review): contig presumably yields $cpj ids starting at
    # $start with stride $step, and fragment presumably splits that
    # list into $ppj chunks -- both are defined elsewhere; confirm.
    set S2 [ contig $start $cpj $step ]
    show step S2
    set K [ fragment $S2 $ppj ]
    show K

    set masks [ list ]
    foreach chunk $K {
        # Trim so that no whitespace can end up inside the option value
        set mask [ string trim [ list2mask $chunk ] ]
        show mask
        lappend masks $mask
    }
    show masks

    # The masks must be comma-separated without blanks: the whole
    # option has to remain a single srun argument
    set cpu_bind " --cpu-bind=verbose,mask_cpu:"
    append cpu_bind [ join $masks "," ]
    return $cpu_bind
}
101
+
102
proc job_srun_error { e } {
    # Report a failed srun invocation on stdout, framing the error
    # message $e between begin/end marker lines.
    foreach text [ list \
                       " turbine: srun failed!" \
                       " turbine: srun error message begin:" \
                       $e \
                       " turbine: srun error message end." ] {
        puts $text
    }
}
108
+
109
proc list2mask { L } {
    # Convert a list of non-negative bit positions (CPU ids) into a
    # hexadecimal bit mask string such as 0x5 for {0 2}.
    #   L: Tcl list of integers; may be empty (result 0x0)
    # Returns the mask as "0x" followed by upper-case hex digits.
    set A 0
    foreach i $L {
        # Bitwise OR: a repeated id must not carry into the next bit
        set A [ expr { $A | (1 << $i) } ]
    }
    # %ll formats without truncation, so ids >= 64 are represented
    return [ format "0x%llX" $A ]
}
51
118
}
0 commit comments