@@ -64,6 +64,7 @@ def get_deployment(
64
64
# Create job info dict
65
65
info = {
66
66
'job_ID' : j ['ID' ],
67
+ 'name' : j ['Name' ],
67
68
'status' : '' , # do not use j['Status'] as misleading
68
69
'owner' : j ['Meta' ]['owner' ],
69
70
'title' : j ['Meta' ]['title' ],
@@ -83,7 +84,7 @@ def get_deployment(
83
84
84
85
# Retrieve tasks
85
86
tasks = j ['TaskGroups' ][0 ]['Tasks' ]
86
- usertask = [t for t in tasks if t ['Name' ] == 'usertask ' ][0 ]
87
+ usertask = [t for t in tasks if t ['Name' ] == 'main ' ][0 ]
87
88
88
89
# Retrieve Docker image
89
90
info ['docker_image' ] = usertask ['Config' ]['image' ]
@@ -134,17 +135,6 @@ def get_deployment(
134
135
except Exception : # return first endpoint
135
136
info ['main_endpoint' ] = list (info ['endpoints' ].values ())[0 ]
136
137
137
- # Add active endpoints
138
- if full_info :
139
- info ['active_endpoints' ] = []
140
- for k , v in info ['endpoints' ].items ():
141
- try :
142
- r = session .get (v , timeout = 2 )
143
- if r .status_code == 200 :
144
- info ['active_endpoints' ].append (k )
145
- except requests .exceptions .Timeout :
146
- continue
147
-
148
138
# Only fill resources if the job is allocated
149
139
allocs = Nomad .job .get_allocations (
150
140
id_ = j ['ID' ],
@@ -198,7 +188,7 @@ def get_deployment(
198
188
199
189
# Add error messages if needed
200
190
if info ['status' ] == 'failed' :
201
- info ['error_msg' ] = a ['TaskStates' ]['usertask ' ]['Events' ][0 ]['Message' ]
191
+ info ['error_msg' ] = a ['TaskStates' ]['main ' ]['Events' ][0 ]['Message' ]
202
192
203
193
# Replace with clearer message
204
194
if info ['error_msg' ] == 'Docker container exited with non-zero exit code: 1' :
@@ -214,12 +204,8 @@ def get_deployment(
214
204
"the network is restored and you should be able to fully recover " \
215
205
"your deployment."
216
206
217
- # Disable access to endpoints if there is a network cut
218
- if info ['status' ] == 'down' and info ['active_endpoints' ]:
219
- info ['active_endpoints' ] = []
220
-
221
207
# Add resources
222
- res = a ['AllocatedResources' ]['Tasks' ]['usertask ' ]
208
+ res = a ['AllocatedResources' ]['Tasks' ]['main ' ]
223
209
gpu = [d for d in res ['Devices' ] if d ['Type' ] == 'gpu' ][0 ] if res ['Devices' ] else None
224
210
cpu_cores = res ['Cpu' ]['ReservedCores' ]
225
211
info ['resources' ] = {
@@ -230,6 +216,26 @@ def get_deployment(
230
216
'disk_MB' : a ['AllocatedResources' ]['Shared' ]['DiskMB' ],
231
217
}
232
218
219
+ # Retrieve the node the job landed on in order to properly fill the endpoints
220
+ n = Nomad .node .get_node (a ['NodeID' ])
221
+ for k , v in info ['endpoints' ].items ():
222
+ info ['endpoints' ][k ] = v .replace ('${meta.domain}' , n ['Meta' ]['domain' ])
223
+
224
+ # Add active endpoints
225
+ if full_info :
226
+ info ['active_endpoints' ] = []
227
+ for k , v in info ['endpoints' ].items ():
228
+ try :
229
+ r = session .get (v , timeout = 2 )
230
+ if r .status_code == 200 :
231
+ info ['active_endpoints' ].append (k )
232
+ except requests .exceptions .Timeout :
233
+ continue
234
+
235
+ # Disable access to endpoints if there is a network cut
236
+ if info ['status' ] == 'down' and info ['active_endpoints' ]:
237
+ info ['active_endpoints' ] = []
238
+
233
239
elif evals :
234
240
# Something happened, job didn't deploy (eg. job needs port that's currently being used)
235
241
# We have to return `placement failures message`.
@@ -259,8 +265,8 @@ def get_deployment(
259
265
260
266
# Add allocation start and end
261
267
if allocs :
262
- info ['alloc_start' ] = a ['TaskStates' ]['usertask ' ]['StartedAt' ]
263
- info ['alloc_end' ] = a ['TaskStates' ]['usertask ' ]['FinishedAt' ]
268
+ info ['alloc_start' ] = a ['TaskStates' ]['main ' ]['StartedAt' ]
269
+ info ['alloc_end' ] = a ['TaskStates' ]['main ' ]['FinishedAt' ]
264
270
265
271
# Dead jobs should have dead state, otherwise status will be misleading (for example)
266
272
if j ['Status' ] == 'dead' :
@@ -286,7 +292,7 @@ def get_deployment(
286
292
287
293
# Skip jobs whose name does not start with module or tool-fl
288
294
# (useful for admins who might have deployed other jobs eg. Traefik)
289
- if not j ['Name' ].startswith ('userjob' ):
295
+ if not ( j ['Name' ].startswith ('module' ) or j [ 'Name' ]. startswith ( 'tool-fl' ) ):
290
296
continue
291
297
292
298
try :
@@ -297,7 +303,7 @@ def get_deployment(
297
303
namespace = namespace ,
298
304
)
299
305
)
300
- except Exception :
306
+ except Exception as e :
301
307
print (f" Failed to retrieve { j ['ID' ]} " )
302
308
303
309
# Save snapshot
0 commit comments