Skip to content

Commit

Permalink
Fix: Tolerate transient connection errors
Browse files Browse the repository at this point in the history
Nexus release process is absolutely critical and it seems just cannot
get our infra to work reliably. Work this around by ignoring
ConnectionErrors unless they pile up.

Issue: IT-25604
Change-Id: I3c035c280dd5ea973c188b78baec76e24aa1ec16
Signed-off-by: Robert Varga <nite@hq.sk>
rovarga committed Oct 31, 2023
1 parent 1eec6f1 commit 88b9444
Showing 1 changed file with 34 additions and 15 deletions.
49 changes: 34 additions & 15 deletions lftools/nexus/cmd.py
Original file line number Diff line number Diff line change
@@ -408,6 +408,16 @@ def add_str_if_not_exist(new_str, existing_str_lst):
return addthis


def find_release_time(events):
"""Returns the time when a repository was released, or None if it has not been released yet."""
for event in events:
name = event.find("name")
stopped = event.find("stopped")
if name.text == "release" and stopped is not None:
return stopped.text
return None


def release_staging_repos(repos, verify, nexus_url=""):
"""Release one or more staging repos.
@@ -504,22 +514,31 @@ def release_staging_repos(repos, verify, nexus_url=""):

# Hang out until the repo is fully released
log.info("Waiting for Nexus to complete releasing {}".format(str(repo)))
released = False
wait_seconds = 20
wait_iteration = 0
consecutive_failures = 0
activity_url = "{}/staging/repository/{}/activity".format(_nexus.baseurl, repo)
sleep(5) # Quick sleep to allow small repos to release.
while released is False:
response = requests.get(activity_url, auth=_nexus.auth).text
root = et.fromstring(response) # nosec
events = root.findall("./stagingActivity")
for event in events:
name = event.find("name")
stopped = event.find("stopped")
if name.text == "release" and stopped is not None:
log.info("Repo released at: {}".format(stopped.text))
released = True
if not released:
sleep(wait_seconds)
wait_iteration += 1
log.info("Still waiting... {:>4d} seconds gone".format(wait_seconds * wait_iteration))
while True:
try:
response = requests.get(activity_url, auth=_nexus.auth).text
consecutive_failures = 0
root = et.fromstring(response) # nosec
time = find_release_time(root.findall("./stagingActivity"))
if time is not None:
log.info("Repo released at: {}".format(time))
break

except requests.exceptions.ConnectionError as e:
# Ignore failures unless they pile up. We do this because we seem to be facing transient
# issues (like DNS failures) and completing repository release here is absolutely critical,
# as otherwise this can lead to failing to perform post-release steps, which cannot be
# manually recovered.
consecutive_failures += 1
if consecutive_failures > 50:
raise e
log.warn(e, stack_info=True, exc_info=True)

sleep(wait_seconds)
wait_iteration += 1
log.info("Still waiting... {:>4d} seconds gone".format(wait_seconds * wait_iteration))

0 comments on commit 88b9444

Please sign in to comment.