From a505c00688b111a51a7885297d16d07eac456400 Mon Sep 17 00:00:00 2001 From: gibsondan Date: Tue, 27 Feb 2024 15:53:09 -0600 Subject: [PATCH] always use an agent heartbeat timeout of 600 in CI/CD Summary: The current logic breaks down if you deploy to serverless and then redeploy with a code change in the middle of the initial deploy: the second run is no longer run number 1, so it gets the 90-second timeout while the agent is still spinning up. That case seems more likely, and more worth designing around, than the (hopefully unlikely) case of the serverless agent going down and needing to wait a few more minutes to find out. Test Plan: Point a pex deploy at this image hash --- src/deploy.sh | 9 +-------- src/deploy_pex.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/deploy.sh b/src/deploy.sh index b85f74d9..86e66293 100755 --- a/src/deploy.sh +++ b/src/deploy.sh @@ -116,13 +116,6 @@ if [[ -z $PR_STATUS || "$PR_STATUS" == "OPEN" ]]; then echo "Deploying location ${INPUT_LOCATION_NAME} to deployment ${DEPLOYMENT_NAME}..." echo "deployment=${DEPLOYMENT_NAME}" >> ${GITHUB_OUTPUT} - # Extend timeout in case the agent is still spinning up - if [[ $CI_RUN_NUMBER -eq 1 ]]; then - AGENT_HEARTBEAT_TIMEOUT=600 - else - AGENT_HEARTBEAT_TIMEOUT=90 - fi - dagster-cloud workspace add-location \ --url "${DAGSTER_CLOUD_URL}/${DEPLOYMENT_NAME}" \ --api-token "$DAGSTER_CLOUD_API_TOKEN" \ @@ -130,7 +123,7 @@ if [[ -z $PR_STATUS || "$PR_STATUS" == "OPEN" ]]; then --location-name "${INPUT_LOCATION_NAME}" \ --image "${INPUT_REGISTRY}:${INPUT_IMAGE_TAG}" \ --location-load-timeout 3600 \ - --agent-heartbeat-timeout $AGENT_HEARTBEAT_TIMEOUT \ + --agent-heartbeat-timeout 600 \ --git-url "$COMMIT_URL" \ --commit-hash "$COMMIT_HASH" diff --git a/src/deploy_pex.py b/src/deploy_pex.py index 207b4243..dd4909ed 100755 --- a/src/deploy_pex.py +++ b/src/deploy_pex.py @@ -20,7 +20,9 @@ import yaml -DAGSTER_CLOUD_PEX_PATH = Path(__file__).parent.parent / "generated/gha/dagster-cloud.pex" +DAGSTER_CLOUD_PEX_PATH = ( + Path(__file__).parent.parent /
"generated/gha/dagster-cloud.pex" +) UPDATE_COMMENT_SCRIPT_PATH = Path(__file__).parent / "create_or_update_comment.py" @@ -70,7 +72,9 @@ def get_locations(dagster_cloud_file) -> List[str]: workspace_contents = f.read() workspace_contents_yaml = yaml.safe_load(workspace_contents) - return [location["location_name"] for location in workspace_contents_yaml["locations"]] + return [ + location["location_name"] for location in workspace_contents_yaml["locations"] + ] def run(args): @@ -112,17 +116,13 @@ def deploy_pex(args, branch_deployment_name: Optional[str], build_method: str): args.insert(0, os.path.dirname(dagster_cloud_yaml)) args = args + [f"--build-method={build_method}"] commit_hash = os.getenv("GITHUB_SHA") - git_url = ( - f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/tree/{commit_hash}" - ) + git_url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/tree/{commit_hash}" deployment_name = branch_deployment_name if branch_deployment_name else "prod" deployment_flag = f"--url={os.getenv('DAGSTER_CLOUD_URL')}/{deployment_name}" locations = get_locations(dagster_cloud_yaml) - # give first deploy extra time to spin up agent - agent_heartbeat_timeout = 600 if (os.getenv("GITHUB_RUN_NUMBER") == "1") else 90 timeout_args = [ "--location-load-timeout=3600", - f"--agent-heartbeat-timeout={agent_heartbeat_timeout}", + "--agent-heartbeat-timeout=600", ] notify(branch_deployment_name, locations, "pending")