Skip to content

Commit 2c64799

Browse files
feat: adapt to the federated cluster (#2)
* feat: update IP of new federated cluster
* feat: update module endpoints parsing in federated cluster
* feat!: update naming in jobs and tasks
* fix: change `NOMAD_TLS_SERVER_NAME`
* fix: change Nomad cert paths
1 parent 5bf458c commit 2c64799

File tree

2 files changed

+33
-27
lines changed

2 files changed

+33
-27
lines changed

take_snapshot.py

+28-22
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def get_deployment(
6464
# Create job info dict
6565
info = {
6666
'job_ID': j['ID'],
67+
'name': j['Name'],
6768
'status': '', # do not use j['Status'] as misleading
6869
'owner': j['Meta']['owner'],
6970
'title': j['Meta']['title'],
@@ -83,7 +84,7 @@ def get_deployment(
8384

8485
# Retrieve tasks
8586
tasks = j['TaskGroups'][0]['Tasks']
86-
usertask = [t for t in tasks if t['Name'] == 'usertask'][0]
87+
usertask = [t for t in tasks if t['Name'] == 'main'][0]
8788

8889
# Retrieve Docker image
8990
info['docker_image'] = usertask['Config']['image']
@@ -134,17 +135,6 @@ def get_deployment(
134135
except Exception: # return first endpoint
135136
info['main_endpoint'] = list(info['endpoints'].values())[0]
136137

137-
# Add active endpoints
138-
if full_info:
139-
info['active_endpoints'] = []
140-
for k, v in info['endpoints'].items():
141-
try:
142-
r = session.get(v, timeout=2)
143-
if r.status_code == 200:
144-
info['active_endpoints'].append(k)
145-
except requests.exceptions.Timeout:
146-
continue
147-
148138
# Only fill resources if the job is allocated
149139
allocs = Nomad.job.get_allocations(
150140
id_=j['ID'],
@@ -198,7 +188,7 @@ def get_deployment(
198188

199189
# Add error messages if needed
200190
if info['status'] == 'failed':
201-
info['error_msg'] = a['TaskStates']['usertask']['Events'][0]['Message']
191+
info['error_msg'] = a['TaskStates']['main']['Events'][0]['Message']
202192

203193
# Replace with clearer message
204194
if info['error_msg'] == 'Docker container exited with non-zero exit code: 1':
@@ -214,12 +204,8 @@ def get_deployment(
214204
"the network is restored and you should be able to fully recover " \
215205
"your deployment."
216206

217-
# Disable access to endpoints if there is a network cut
218-
if info['status'] == 'down' and info['active_endpoints']:
219-
info['active_endpoints'] = []
220-
221207
# Add resources
222-
res = a['AllocatedResources']['Tasks']['usertask']
208+
res = a['AllocatedResources']['Tasks']['main']
223209
gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0] if res['Devices'] else None
224210
cpu_cores = res['Cpu']['ReservedCores']
225211
info['resources'] = {
@@ -230,6 +216,26 @@ def get_deployment(
230216
'disk_MB': a['AllocatedResources']['Shared']['DiskMB'],
231217
}
232218

219+
# Retrieve the node the job landed on, in order to properly fill in the endpoints
220+
n = Nomad.node.get_node(a['NodeID'])
221+
for k, v in info['endpoints'].items():
222+
info['endpoints'][k] = v.replace('${meta.domain}', n['Meta']['domain'])
223+
224+
# Add active endpoints
225+
if full_info:
226+
info['active_endpoints'] = []
227+
for k, v in info['endpoints'].items():
228+
try:
229+
r = session.get(v, timeout=2)
230+
if r.status_code == 200:
231+
info['active_endpoints'].append(k)
232+
except requests.exceptions.Timeout:
233+
continue
234+
235+
# Disable access to endpoints if there is a network cut
236+
if info['status'] == 'down' and info['active_endpoints']:
237+
info['active_endpoints'] = []
238+
233239
elif evals:
234240
# Something happened, job didn't deploy (eg. job needs port that's currently being used)
235241
# We have to return `placement failures message`.
@@ -259,8 +265,8 @@ def get_deployment(
259265

260266
# Add allocation start and end
261267
if allocs:
262-
info['alloc_start'] = a['TaskStates']['usertask']['StartedAt']
263-
info['alloc_end'] = a['TaskStates']['usertask']['FinishedAt']
268+
info['alloc_start'] = a['TaskStates']['main']['StartedAt']
269+
info['alloc_end'] = a['TaskStates']['main']['FinishedAt']
264270

265271
# Dead jobs should have dead state, otherwise status will be misleading (for example)
266272
if j['Status'] == 'dead':
@@ -286,7 +292,7 @@ def get_deployment(
286292

287293
# Skip jobs whose name does not start with `module` or `tool-fl`
288294
# (useful for admins who might have deployed other jobs eg. Traefik)
289-
if not j['Name'].startswith('userjob'):
295+
if not (j['Name'].startswith('module') or j['Name'].startswith('tool-fl')):
290296
continue
291297

292298
try:
@@ -297,7 +303,7 @@ def get_deployment(
297303
namespace=namespace,
298304
)
299305
)
300-
except Exception:
306+
except Exception as e:
301307
print(f" Failed to retrieve {j['ID']}")
302308

303309
# Save snapshot

take_snapshot.sh

+5-5
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
# 0 */4 * * * /bin/bash /mnt/ai4-logs/ai4-accounting/take_snapshot.sh
66

77
# Export proper Nomad variables
8-
export NOMAD_ADDR=https://193.146.75.221:4646 # production cluster
9-
export NOMAD_CACERT=/home/ubuntu/nomad-certs/nomad-prod/nomad-ca.pem
10-
export NOMAD_CLIENT_CERT=/home/ubuntu/nomad-certs/nomad-prod/cli.pem
11-
export NOMAD_CLIENT_KEY=/home/ubuntu/nomad-certs/nomad-prod/cli-key.pem
12-
export NOMAD_TLS_SERVER_NAME=server.global.nomad
8+
export NOMAD_ADDR=https://193.146.75.205:4646 # production cluster
9+
export NOMAD_CACERT=/home/ubuntu/nomad-certs/nomad-federated/nomad-ca.pem
10+
export NOMAD_CLIENT_CERT=/home/ubuntu/nomad-certs/nomad-federated/cli.pem
11+
export NOMAD_CLIENT_KEY=/home/ubuntu/nomad-certs/nomad-federated/cli-key.pem
12+
export NOMAD_TLS_SERVER_NAME=node-ifca-0
1313

1414
# Move to main directory (where this script is located)
1515
cd $(dirname "$0")

0 commit comments

Comments
 (0)