From a785d353a1feb1f3959a2bbd306269585ed2d207 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 24 Jan 2025 14:42:59 -0800 Subject: [PATCH] add action of free up disk space (including move docker data directory) Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 24 --------- .../workflows/free-up-disk-space/action.yaml | 49 +++++++++++++++++++ .../template-setup-e2e-test/action.yaml | 15 +----- .../kubeflow/katib/api/katib_client.py | 2 +- 4 files changed, 52 insertions(+), 38 deletions(-) create mode 100644 .github/workflows/free-up-disk-space/action.yaml diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 86cb786647f..54550b03542 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -34,36 +34,12 @@ jobs: echo "Checking disk space usage before e2e test..." df -h # Run 'df' to check free disk space - - name: Monitor Memory Usage Before Run - if: always() - run: free -h - - - name: Monitor Docker Container Memory Usage - if: always() - run: | - docker stats --no-stream - - name: Run e2e test with tune API if: always() uses: ./.github/workflows/template-e2e-test with: tune-api: true training-operator: true - - - name: Get YAML file of Experiment - if: always() - run: | - echo "Fetching the YAML file of the experiment..." - kubectl get experiment tune-example-2 -n default -o yaml - - - name: Monitor Memory Usage After Run - if: always() - run: free -h - - - name: Monitor Docker Container Memory Usage - if: always() - run: | - docker stats --no-stream - name: Check Disk Space After Test if: always() # Run this step even if previous steps fail diff --git a/.github/workflows/free-up-disk-space/action.yaml b/.github/workflows/free-up-disk-space/action.yaml new file mode 100644 index 00000000000..110e3a21b84 --- /dev/null +++ b/.github/workflows/free-up-disk-space/action.yaml @@ -0,0 +1,49 @@ +name: Free-Up Disk Space +description: Remove Non-Essential Tools And Move Docker Data Directory to /mnt/docker + +runs: + using: composite + steps: + # This step is a Workaround to avoid the "No space left on device" error. + # ref: https://github.com/actions/runner-images/issues/2840 + - name: Remove unnecessary files + shell: bash + run: | + echo "Disk usage before cleanup:" + df -hT + + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/local/share/powershell + sudo rm -rf /usr/share/swift + + echo "Disk usage after cleanup:" + df -hT + + - name: Prune docker images + shell: bash + run: | + docker image prune -a -f + docker system df + df -hT + + - name: Move docker data directory + shell: bash + run: | + echo "Stopping docker service ..." + sudo systemctl stop docker + DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker + DOCKER_ROOT_DIR=/mnt/docker + echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" + sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR} + echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" + sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR} + echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})" + echo "Starting docker service ..." + sudo systemctl daemon-reload + sudo systemctl start docker + echo "Docker service status:" + sudo systemctl --no-pager -l -o short status docker \ No newline at end of file diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml index 561f127648a..f85697fe3d5 100644 --- a/.github/workflows/template-setup-e2e-test/action.yaml +++ b/.github/workflows/template-setup-e2e-test/action.yaml @@ -17,19 +17,8 @@ runs: steps: # This step is a Workaround to avoid the "No space left on device" error. # ref: https://github.com/actions/runner-images/issues/2840 - - name: Remove unnecessary files - shell: bash - run: | - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/local/share/powershell - sudo rm -rf /usr/share/swift - - echo "Disk usage after cleanup:" - df -h + - name: Free-Up Disk Space + uses: ./.github/workflows/free-up-disk-space - name: Setup kubectl uses: azure/setup-kubectl@v4 diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 82a3712cda1..b641800290f 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -635,7 +635,7 @@ class name in this argument. container_spec = training_utils.get_container_spec( name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"], - base_image="docker.io/helenxiehz428/trainer:test", + base_image=TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", model_provider_parameters.model_uri,