Skip to content

Commit

Permalink
Add an action to free up disk space (including moving the Docker data directory)
Browse files Browse the repository at this point in the history
Signed-off-by: helenxie-bit <[email protected]>
  • Loading branch information
helenxie-bit committed Jan 24, 2025
1 parent b5cae0d commit a785d35
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 38 deletions.
24 changes: 0 additions & 24 deletions .github/workflows/e2e-test-tune-api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,36 +34,12 @@ jobs:
echo "Checking disk space usage before e2e test..."
df -h # Run 'df' to check free disk space
- name: Monitor Memory Usage Before Run
if: always()
run: free -h

- name: Monitor Docker Container Memory Usage
if: always()
run: |
docker stats --no-stream
- name: Run e2e test with tune API
if: always()
uses: ./.github/workflows/template-e2e-test
with:
tune-api: true
training-operator: true

- name: Get YAML file of Experiment
if: always()
run: |
echo "Fetching the YAML file of the experiment..."
kubectl get experiment tune-example-2 -n default -o yaml
- name: Monitor Memory Usage After Run
if: always()
run: free -h

- name: Monitor Docker Container Memory Usage
if: always()
run: |
docker stats --no-stream

- name: Check Disk Space After Test
if: always() # Run this step even if previous steps fail
Expand Down
49 changes: 49 additions & 0 deletions .github/workflows/free-up-disk-space/action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
name: Free-Up Disk Space
description: Remove Non-Essential Tools And Move Docker Data Directory to /mnt/docker

runs:
  using: composite
  steps:
    # Workaround to avoid the "No space left on device" error.
    # ref: https://github.com/actions/runner-images/issues/2840
    # Deletes tool directories that this action treats as non-essential and
    # prints disk usage before and after so the reclaimed space shows in the log.
    - name: Remove unnecessary files
      shell: bash
      run: |
        echo "Disk usage before cleanup:"
        df -hT
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /usr/local/share/boost
        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
        sudo rm -rf /usr/local/lib/android
        sudo rm -rf /usr/local/share/powershell
        sudo rm -rf /usr/share/swift
        echo "Disk usage after cleanup:"
        df -hT

    # Drops every image not used by a container, then reports what is left.
    - name: Prune docker images
      shell: bash
      run: |
        docker image prune -a -f
        docker system df
        df -hT

    # Relocates the Docker data directory to /mnt/docker and leaves a symlink
    # at the default location so Docker keeps using its default path.
    - name: Move docker data directory
      shell: bash
      run: |
        DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
        DOCKER_ROOT_DIR=/mnt/docker

        # Already relocated (e.g. this action ran earlier on the same runner):
        # moving again would move the symlink itself, so there is nothing to do.
        if sudo test -L "${DOCKER_DEFAULT_ROOT_DIR}"; then
          echo "${DOCKER_DEFAULT_ROOT_DIR} is already a symlink; skipping move."
          sudo ls -l "${DOCKER_DEFAULT_ROOT_DIR}"
          exit 0
        fi

        # mv into an existing directory would nest the data at
        # ${DOCKER_ROOT_DIR}/docker while the symlink points at the parent,
        # so Docker would start on an empty data root. Fail fast instead.
        if sudo test -e "${DOCKER_ROOT_DIR}"; then
          echo "::error::${DOCKER_ROOT_DIR} already exists; refusing to move Docker data into it."
          exit 1
        fi

        echo "Stopping docker service ..."
        sudo systemctl stop docker

        echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
        sudo mv "${DOCKER_DEFAULT_ROOT_DIR}" "${DOCKER_ROOT_DIR}"

        echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
        sudo ln -s "${DOCKER_ROOT_DIR}" "${DOCKER_DEFAULT_ROOT_DIR}"
        # Show the symlink itself so the log confirms where it points.
        sudo ls -l "${DOCKER_DEFAULT_ROOT_DIR}"

        echo "Starting docker service ..."
        sudo systemctl daemon-reload
        sudo systemctl start docker

        echo "Docker service status:"
        sudo systemctl --no-pager -l -o short status docker
15 changes: 2 additions & 13 deletions .github/workflows/template-setup-e2e-test/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,8 @@ runs:
steps:
# This step is a Workaround to avoid the "No space left on device" error.
# ref: https://github.com/actions/runner-images/issues/2840
- name: Remove unnecessary files
shell: bash
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/share/swift
echo "Disk usage after cleanup:"
df -h
- name: Free-Up Disk Space
uses: ./.github/workflows/free-up-disk-space

- name: Setup kubectl
uses: azure/setup-kubectl@v4
Expand Down
2 changes: 1 addition & 1 deletion sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,7 +635,7 @@ class name in this argument.

container_spec = training_utils.get_container_spec(
name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"],
base_image="docker.io/helenxiehz428/trainer:test",
base_image=TRAINER_TRANSFORMER_IMAGE,
args=[
"--model_uri",
model_provider_parameters.model_uri,
Expand Down

0 comments on commit a785d35

Please sign in to comment.