From 21120c673b915a618f4758df9dfcf37c517f9973 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Thu, 23 May 2024 09:23:25 -0400
Subject: [PATCH] Fix_duplicate_job_checks (#1759)

* Fix up the final dispatch-job checks.

Comparing dispatch jobs worked before origin information was added, but now duplicate jobs with unique origins will not be caught.

* Test that duplicate jobs are removed.

* Revert "Test that duplicate jobs are removed." [skip-tests]

This reverts commit 1204aa2ad74ea9ac8681c2f5117e77a42fea1783.
---
 .../actions/workflow-build/build-workflow.py  | 85 +++++++++++++------
 1 file changed, 57 insertions(+), 28 deletions(-)

diff --git a/.github/actions/workflow-build/build-workflow.py b/.github/actions/workflow-build/build-workflow.py
index c71dd710d6f..70b1c691c61 100755
--- a/.github/actions/workflow-build/build-workflow.py
+++ b/.github/actions/workflow-build/build-workflow.py
@@ -361,9 +361,49 @@ def merge_dispatch_groups(accum_dispatch_groups, new_dispatch_groups):
                 accum_dispatch_groups[group_name][key] += value
 
 
+def compare_dispatch_jobs(job1, job2):
+    "Compare two dispatch job specs for equality. Considers only name/runner/image/command."
+    # Ignores the 'origin' key, which may vary between identical job specifications.
+    return (job1['name']    == job2['name'] and
+            job1['runner']  == job2['runner'] and
+            job1['image']   == job2['image'] and
+            job1['command'] == job2['command'])
+
+
+def dispatch_job_in_container(job, container):
+    "Check if a dispatch job is in a container, using compare_dispatch_jobs."
+    for job2 in container:
+        if compare_dispatch_jobs(job, job2):
+            return True
+    return False
+
+def remove_dispatch_job_from_container(job, container):
+    "Remove a dispatch job from a container, using compare_dispatch_jobs."
+    for i, job2 in enumerate(container):
+        if compare_dispatch_jobs(job, job2):
+            del container[i]
+            return True
+    return False
+
+
 def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
     workflow_dispatch_groups = copy.deepcopy(workflow_dispatch_groups_orig)
 
+    # Check to see if any .two_stage.producers arrays have more than 1 job, which is not supported.
+    # See ci-dispatch-two-stage.yml for details.
+    for group_name, group_json in workflow_dispatch_groups.items():
+        if 'two_stage' in group_json:
+            for two_stage_json in group_json['two_stage']:
+                num_producers = len(two_stage_json['producers'])
+                if num_producers > 1:
+                    producer_names = ""
+                    for job in two_stage_json['producers']:
+                        producer_names += f" - {job['name']}\n"
+                    error_message = f"ci-dispatch-two-stage.yml currently only supports a single producer. "
+                    error_message += f"Found {num_producers} producers in '{group_name}':\n{producer_names}"
+                    print(f"::error file=ci/matrix.yaml::{error_message}", file=sys.stderr)
+                    raise Exception(error_message)
+
     # Merge consumers for any two_stage arrays that have the same producer(s). Print a warning.
     for group_name, group_json in workflow_dispatch_groups.items():
         if not 'two_stage' in group_json:
@@ -374,12 +414,17 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
         for two_stage in two_stage_json:
             producers = two_stage['producers']
             consumers = two_stage['consumers']
-            if producers in merged_producers:
+
+            # Make sure this gets updated if we add support for multiple producers:
+            assert(len(producers) == 1)
+            producer = producers[0]
+
+            if dispatch_job_in_container(producer, merged_producers):
-                producer_index = merged_producers.index(producers)
+                producer_index = next(i for i, p in enumerate(merged_producers) if compare_dispatch_jobs(producer, p))
                 matching_consumers = merged_consumers[producer_index]
 
-                producer_names = ", ".join([producer['name'] for producer in producers])
-                print(f"::notice file=ci/matrix.yaml::Merging consumers for duplicate producer '{producer_names}' in '{group_name}'",
+                producer_name = producer['name']
+                print(f"::notice file=ci/matrix.yaml::Merging consumers for duplicate producer '{producer_name}' in '{group_name}'",
                       file=sys.stderr)
                 consumer_names = ", ".join([consumer['name'] for consumer in matching_consumers])
                 print(f"::notice file=ci/matrix.yaml::Original consumers: {consumer_names}", file=sys.stderr)
@@ -387,17 +432,17 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
                 print(f"::notice file=ci/matrix.yaml::Duplicate consumers: {consumer_names}", file=sys.stderr)
                 # Merge if unique:
                 for consumer in consumers:
-                    if consumer not in matching_consumers:
+                    if not dispatch_job_in_container(consumer, matching_consumers):
                         matching_consumers.append(consumer)
                 consumer_names = ", ".join([consumer['name'] for consumer in matching_consumers])
                 print(f"::notice file=ci/matrix.yaml::Merged consumers: {consumer_names}", file=sys.stderr)
             else:
-                merged_producers.append(producers)
+                merged_producers.append(producer)
                 merged_consumers.append(consumers)
         # Update with the merged lists:
         two_stage_json = []
-        for producers, consumers in zip(merged_producers, merged_consumers):
-            two_stage_json.append({'producers': producers, 'consumers': consumers})
+        for producer, consumers in zip(merged_producers, merged_consumers):
+            two_stage_json.append({'producers': [producer], 'consumers': consumers})
         group_json['two_stage'] = two_stage_json
 
     # Check for any duplicate jobs in standalone arrays. Warn and remove duplicates.
@@ -405,7 +450,7 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
         standalone_jobs = group_json['standalone'] if 'standalone' in group_json else []
         unique_standalone_jobs = []
         for job_json in standalone_jobs:
-            if job_json in unique_standalone_jobs:
+            if dispatch_job_in_container(job_json, unique_standalone_jobs):
                 print(f"::notice file=ci/matrix.yaml::Removing duplicate standalone job '{job_json['name']}' in '{group_name}'",
                       file=sys.stderr)
             else:
@@ -415,25 +460,24 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig):
         two_stage_jobs = group_json['two_stage'] if 'two_stage' in group_json else []
         for two_stage_job in two_stage_jobs:
             for producer in two_stage_job['producers']:
-                if producer in unique_standalone_jobs:
+                if remove_dispatch_job_from_container(producer, unique_standalone_jobs):
                     print(f"::notice file=ci/matrix.yaml::Removing standalone job '{producer['name']}' " +
                           f"as it appears as a producer in '{group_name}'",
                           file=sys.stderr)
-                    unique_standalone_jobs.remove(producer)
             for consumer in two_stage_job['consumers']:
-                if consumer in unique_standalone_jobs:
+                if remove_dispatch_job_from_container(consumer, unique_standalone_jobs):
                     print(f"::notice file=ci/matrix.yaml::Removing standalone job '{consumer['name']}' " +
                           f"as it appears as a consumer in '{group_name}'",
                           file=sys.stderr)
-                    unique_standalone_jobs.remove(consumer)
         standalone_jobs = list(unique_standalone_jobs)
+        group_json['standalone'] = standalone_jobs
 
         # If any producer or consumer job appears more than once, warn and leave as-is.
         all_two_stage_jobs = []
         duplicate_jobs = {}
         for two_stage_job in two_stage_jobs:
             for job in two_stage_job['producers'] + two_stage_job['consumers']:
-                if job in all_two_stage_jobs:
+                if dispatch_job_in_container(job, all_two_stage_jobs):
                     duplicate_jobs[job['name']] = duplicate_jobs.get(job['name'], 1) + 1
                 else:
                     all_two_stage_jobs.append(job)
@@ -467,21 +511,6 @@ def natural_sort_key(key):
             group_json['two_stage'] = sorted(
                 group_json['two_stage'], key=lambda x: natural_sort_key(x['producers'][0]['name']))
 
-    # Check to see if any .two_stage.producers arrays have more than 1 job, which is not supported.
-    # See ci-dispatch-two-stage.yml for details.
-    for group_name, group_json in workflow_dispatch_groups.items():
-        if 'two_stage' in group_json:
-            for two_stage_json in group_json['two_stage']:
-                num_producers = len(two_stage_json['producers'])
-                if num_producers > 1:
-                    producer_names = ""
-                    for job in two_stage_json['producers']:
-                        producer_names += f" - {job['name']}\n"
-                    error_message = f"ci-dispatch-two-stage.yml currently only supports a single producer. "
-                    error_message += f"Found {num_producers} producers in '{group_name}':\n{producer_names}"
-                    print(f"::error file=ci/matrix.yaml::{error_message}", file=sys.stderr)
-                    raise Exception(error_message)
-
     # Assign unique IDs in appropriate locations.
     # These are used to give "hidden" dispatch jobs a short, unique name,
     # otherwise GHA generates a long, cluttered name.