From 6be7f29582a114cc01b6e92b485f04c4dfddd5c1 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 21:10:31 +0800 Subject: [PATCH 01/57] add e2e test for tune api Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 5 + .../scripts/gh-actions/run-e2e-tune-api.py | 92 ++++++++++++++++++- 2 files changed, 94 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index e1f37a3701b..6ac0c6e0dfc 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -22,10 +22,15 @@ jobs: with: kubernetes-version: ${{ matrix.kubernetes-version }} + - name: Install Training Operator SDK + shell: bash + run: pip install kubeflow-training[huggingface] + - name: Run e2e test with tune API uses: ./.github/workflows/template-e2e-test with: tune-api: true + training-operator: true strategy: fail-fast: false diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index c9d1cb2ee43..944def8cd36 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -1,8 +1,15 @@ import argparse import logging -from kubeflow.katib import KatibClient, search +import transformers +from kubeflow.katib import KatibClient, search, types +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + HuggingFaceModelParams, + HuggingFaceTrainerParams, +) from kubernetes import client +from peft import LoraConfig from verify import verify_experiment_results # Experiment timeout is 40 min. @@ -12,7 +19,8 @@ logging.basicConfig(level=logging.INFO) -def run_e2e_experiment_create_by_tune( +# Test for Experiment created with custom objective. +def run_e2e_experiment_create_by_tune_with_custom_objective( katib_client: KatibClient, exp_name: str, exp_namespace: str, @@ -57,6 +65,70 @@ def objective(parameters): logging.debug(katib_client.get_experiment(exp_name, exp_namespace)) logging.debug(katib_client.get_suggestion(exp_name, exp_namespace)) +# Test for Experiment created with external models and datasets. +def run_e2e_experiment_create_by_tune_with_external_model( + katib_client: KatibClient, + exp_name: str, + exp_namespace: str, +): + # Create Katib Experiment and wait until it is finished. + logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name)) + + # Use the test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + # Create Katib Experiment. + # And Wait until Experiment reaches Succeeded condition. + katib_client.tune( + name=exp_name, + namespace=exp_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate = search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r = search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name = "train_loss", + objective_type = "minimize", + algorithm_name = "random", + max_trial_count = 1, + parallel_trial_count = 1, + resources_per_trial=types.TrainerResources( + num_workers=1, + num_procs_per_worker=1, + resources_per_worker={"cpu": "2", "memory": "10G",}, + ), + ) + experiment = katib_client.wait_for_experiment_condition( + exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT + ) + + # Verify the Experiment results. + verify_experiment_results(katib_client, experiment, exp_name, exp_namespace) + + # Print the Experiment and Suggestion. + logging.debug(katib_client.get_experiment(exp_name, exp_namespace)) + logging.debug(katib_client.get_suggestion(exp_name, exp_namespace)) if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -82,7 +154,21 @@ def objective(parameters): exp_name = "tune-example" exp_namespace = args.namespace try: - run_e2e_experiment_create_by_tune(katib_client, exp_name, exp_namespace) + run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, exp_name, exp_namespace) + logging.info("---------------------------------------------------------------") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}") + except Exception as e: + logging.info("---------------------------------------------------------------") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}") + raise e + finally: + # Delete the Experiment. + logging.info("---------------------------------------------------------------") + logging.info("---------------------------------------------------------------") + katib_client.delete_experiment(exp_name, exp_namespace) + + try: + run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name, exp_namespace) logging.info("---------------------------------------------------------------") logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}") except Exception as e: From 1a1f119a1ff5c9f49eaa3f5b2f9b23d10a1fa1aa Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 21:38:10 +0800 Subject: [PATCH 02/57] upgrade training-operator sdk Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 6ac0c6e0dfc..2c45d75f99b 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -24,7 +24,7 @@ jobs: - name: Install Training Operator SDK shell: bash - run: pip install kubeflow-training[huggingface] + run: pip install -U kubeflow-training[huggingface] - name: Run e2e test with tune API uses: ./.github/workflows/template-e2e-test From 8461a49230b240dea4cfaaf0280bbe356d091385 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 21:50:06 +0800 Subject: [PATCH 03/57] specify the version of training operator sdk Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 2c45d75f99b..c631ad9420b 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -24,7 +24,7 @@ jobs: - name: Install Training Operator SDK shell: bash - run: pip install -U kubeflow-training[huggingface] + run: pip install kubeflow-training[huggingface]==1.8.0 - name: Run e2e test with tune API uses: ./.github/workflows/template-e2e-test From c860238525aba14dcb57e207e1449af7ade0735b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 22:16:45 +0800 Subject: [PATCH 04/57] fix num_labels error and update the version of training operator controller Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 2 +- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 ++ test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index c631ad9420b..2c45d75f99b 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -24,7 +24,7 @@ jobs: - name: Install Training Operator SDK shell: bash - run: pip install kubeflow-training[huggingface]==1.8.0 + run: pip install -U kubeflow-training[huggingface] - name: Run e2e test with tune API uses: ./.github/workflows/template-e2e-test diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 05fd1405a3f..49c5d88e584 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -633,6 +633,8 @@ class name in this argument. model_provider_parameters.model_uri, "--transformer_type", model_provider_parameters.transformer_type.__name__, + "--num_labels", + str(model_provider_parameters.num_labels), "--model_dir", VOLUME_PATH_MODEL, "--dataset_dir", diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index d0b05caf712..68f5e6d1a5d 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -25,7 +25,7 @@ DEPLOY_TRAINING_OPERATOR=${2:-false} WITH_DATABASE_TYPE=${3:-mysql} E2E_TEST_IMAGE_TAG="e2e-test" -TRAINING_OPERATOR_VERSION="v1.6.0-rc.0" +TRAINING_OPERATOR_VERSION="v1.8.0" echo "Start to install Katib" From 216ebd9a4411815dadc1134882c1acdd8be203d0 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 22:30:39 +0800 Subject: [PATCH 05/57] check the version of training operator Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 2c45d75f99b..a7609e8c47d 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -24,7 +24,9 @@ jobs: - name: Install Training Operator SDK shell: bash - run: pip install -U kubeflow-training[huggingface] + run: | + pip install -U kubeflow-training[huggingface] + pip show kubeflow-training | grep Version - name: Run e2e test with tune API uses: ./.github/workflows/template-e2e-test From f6b96f5e10708e18dff97c36f638ddef3b04ed73 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 22:55:27 +0800 Subject: [PATCH 06/57] debug Signed-off-by: helenxie-bit --- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 944def8cd36..b339f0458e5 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -10,6 +10,7 @@ ) from kubernetes import client from peft import LoraConfig +from typing import get_type_hints from verify import verify_experiment_results # Experiment timeout is 40 min. @@ -71,6 +72,10 @@ def run_e2e_experiment_create_by_tune_with_external_model( exp_name: str, exp_namespace: str, ): + # Debugging: Print the module and annotations of HuggingFaceModelParams + print("HuggingFaceModelParams is defined in module:", HuggingFaceModelParams.__module__) + print("HuggingFaceModelParams annotations:", get_type_hints(HuggingFaceModelParams)) + # Create Katib Experiment and wait until it is finished. logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name)) From c6364932778bf632bc3ebbfd4ae6f71c2cc520ac Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 23:15:21 +0800 Subject: [PATCH 07/57] check import path of HuggingFaceModelParams Signed-off-by: helenxie-bit --- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index b339f0458e5..0312d52b902 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -10,6 +10,7 @@ ) from kubernetes import client from peft import LoraConfig +import sys from typing import get_type_hints from verify import verify_experiment_results @@ -75,6 +76,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( # Debugging: Print the module and annotations of HuggingFaceModelParams print("HuggingFaceModelParams is defined in module:", HuggingFaceModelParams.__module__) print("HuggingFaceModelParams annotations:", get_type_hints(HuggingFaceModelParams)) + print(sys.modules['kubeflow'].HuggingFaceModelParams.__file__) # Create Katib Experiment and wait until it is finished. logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name)) From 8180422d050129d72bb80cf752cdb37db2c8aac2 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 14:41:36 +0800 Subject: [PATCH 08/57] update the version of training operator sdk Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 4 ++-- .../e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 10 ++-------- test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh | 2 +- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index a7609e8c47d..e72e6f6ef9b 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -25,8 +25,8 @@ jobs: - name: Install Training Operator SDK shell: bash run: | - pip install -U kubeflow-training[huggingface] - pip show kubeflow-training | grep Version + pip install git+https://github.com/kubeflow/training-operator.git@v1.8-branch#subdirectory=sdk/python + pip install peft==0.3.0 datasets==2.15.0 transformers==4.38.0 - name: Run e2e test with tune API uses: ./.github/workflows/template-e2e-test diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 0312d52b902..10bc75b41e6 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -1,6 +1,7 @@ import argparse import logging +import kubeflow.katib as katib import transformers from kubeflow.katib import KatibClient, search, types from kubeflow.storage_initializer.hugging_face import ( @@ -10,8 +11,6 @@ ) from kubernetes import client from peft import LoraConfig -import sys -from typing import get_type_hints from verify import verify_experiment_results # Experiment timeout is 40 min. @@ -73,11 +72,6 @@ def run_e2e_experiment_create_by_tune_with_external_model( exp_name: str, exp_namespace: str, ): - # Debugging: Print the module and annotations of HuggingFaceModelParams - print("HuggingFaceModelParams is defined in module:", HuggingFaceModelParams.__module__) - print("HuggingFaceModelParams annotations:", get_type_hints(HuggingFaceModelParams)) - print(sys.modules['kubeflow'].HuggingFaceModelParams.__file__) - # Create Katib Experiment and wait until it is finished. logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name)) @@ -120,7 +114,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( algorithm_name = "random", max_trial_count = 1, parallel_trial_count = 1, - resources_per_trial=types.TrainerResources( + resources_per_trial=katib.TrainerResources( num_workers=1, num_procs_per_worker=1, resources_per_worker={"cpu": "2", "memory": "10G",}, diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index 68f5e6d1a5d..d0b05caf712 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -25,7 +25,7 @@ DEPLOY_TRAINING_OPERATOR=${2:-false} WITH_DATABASE_TYPE=${3:-mysql} E2E_TEST_IMAGE_TAG="e2e-test" -TRAINING_OPERATOR_VERSION="v1.8.0" +TRAINING_OPERATOR_VERSION="v1.6.0-rc.0" echo "Start to install Katib" From 6101489db88f264cac13d08d6e8ff2213052ffa5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 15:05:33 +0800 Subject: [PATCH 09/57] update the name of experiment Signed-off-by: helenxie-bit --- .../v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 10bc75b41e6..640cb2a595b 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -152,7 +152,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( client.CoreV1Api().patch_namespace(args.namespace, {'metadata': {'labels': namespace_labels}}) # Test with run_e2e_experiment_create_by_tune - exp_name = "tune-example" + exp_name = "tune-example-1" exp_namespace = args.namespace try: run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, exp_name, exp_namespace) @@ -168,16 +168,17 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info("---------------------------------------------------------------") katib_client.delete_experiment(exp_name, exp_namespace) + exp_name_2 = "tune-example-2" try: - run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name, exp_namespace) + run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name_2, exp_namespace) logging.info("---------------------------------------------------------------") - logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name_2}") except Exception as e: logging.info("---------------------------------------------------------------") - logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_2}") raise e finally: # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") - katib_client.delete_experiment(exp_name, exp_namespace) + katib_client.delete_experiment(exp_name_2, exp_namespace) From d67a1b8a0cb1f80b1a2b45f0d0149e8003cea822 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 15:51:12 +0800 Subject: [PATCH 10/57] add step of checking pod Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 12 ++++++++++++ .../v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index e72e6f6ef9b..31d3585cff2 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -33,6 +33,18 @@ jobs: with: tune-api: true training-operator: true + + - name: Check the status of Experiment and Trials + shell: bash + run: | + kubectl get pods -n default + + # describe pod + pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}') + kubectl describe pod $pod_name -n default + + # check the logs of pod + kubectl logs $pod_name -n default strategy: fail-fast: false diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 640cb2a595b..135f40c6ef8 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -14,7 +14,7 @@ from verify import verify_experiment_results # Experiment timeout is 40 min. -EXPERIMENT_TIMEOUT = 60 * 40 +EXPERIMENT_TIMEOUT = 60 * 10 # The default logging config. logging.basicConfig(level=logging.INFO) From 295abb6f1786ca80d8006aa2ce9205fe6515fafb Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 17:02:02 +0800 Subject: [PATCH 11/57] check the logs of pod Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 12 ------- .../scripts/gh-actions/run-e2e-tune-api.py | 33 +++++++++++++++++-- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 31d3585cff2..e72e6f6ef9b 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -33,18 +33,6 @@ jobs: with: tune-api: true training-operator: true - - - name: Check the status of Experiment and Trials - shell: bash - run: | - kubectl get pods -n default - - # describe pod - pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}') - kubectl describe pod $pod_name -n default - - # check the logs of pod - kubectl logs $pod_name -n default strategy: fail-fast: false diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 135f40c6ef8..48496b864c6 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -3,13 +3,13 @@ import kubeflow.katib as katib import transformers -from kubeflow.katib import KatibClient, search, types +from kubeflow.katib import KatibClient, search from kubeflow.storage_initializer.hugging_face import ( HuggingFaceDatasetParams, HuggingFaceModelParams, HuggingFaceTrainerParams, ) -from kubernetes import client +from kubernetes import client, config from peft import LoraConfig from verify import verify_experiment_results @@ -19,6 +19,25 @@ # The default logging config. logging.basicConfig(level=logging.INFO) +# Function to get logs of the pod related to the experiment. +def get_experiment_pod_logs(namespace: str, exp_name: str): + v1 = client.CoreV1Api() + pods = v1.list_namespaced_pod(namespace) + + for pod in pods.items: + # Identify the pod associated with the experiment + if exp_name in pod.metadata.name: + logging.info(f"Describing pod: {pod.metadata.name}") + pod_description = v1.read_namespaced_pod(name=pod.metadata.name, namespace=namespace) + logging.info(pod_description) + + logging.info(f"Fetching logs for pod: {pod.metadata.name}") + pod_logs = v1.read_namespaced_pod_log(name=pod.metadata.name, namespace=namespace) + logging.info(pod_logs) + break + else: + logging.warning(f"No pod found for experiment: {exp_name}") + # Test for Experiment created with custom objective. def run_e2e_experiment_create_by_tune_with_custom_objective( @@ -144,6 +163,8 @@ def run_e2e_experiment_create_by_tune_with_external_model( if args.verbose: logging.getLogger().setLevel(logging.DEBUG) + config.load_kube_config() # Load Kubernetes config from the environment + katib_client = KatibClient() namespace_labels = client.CoreV1Api().read_namespace(args.namespace).metadata.labels @@ -163,6 +184,9 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}") raise e finally: + # Describe and get logs of the experiment pod + get_experiment_pod_logs(exp_namespace, exp_name) + # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") @@ -178,7 +202,10 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_2}") raise e finally: + # Describe and get logs of the experiment pod + get_experiment_pod_logs(exp_namespace, exp_name_2) + # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") - katib_client.delete_experiment(exp_name_2, exp_namespace) + #katib_client.delete_experiment(exp_name_2, exp_namespace) From e0a1b6dad38e555f604eae15a3f6aa3216bdbc37 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 17:17:33 +0800 Subject: [PATCH 12/57] add check Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index e72e6f6ef9b..31d3585cff2 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -33,6 +33,18 @@ jobs: with: tune-api: true training-operator: true + + - name: Check the status of Experiment and Trials + shell: bash + run: | + kubectl get pods -n default + + # describe pod + pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}') + kubectl describe pod $pod_name -n default + + # check the logs of pod + kubectl logs $pod_name -n default strategy: fail-fast: false From 1df7df953ba74fd1f0890f0a6074c5cd2039ee47 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 18:06:38 +0800 Subject: [PATCH 13/57] check reason for imagepullbackoff Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 12 ------- .../workflows/template-e2e-test/action.yaml | 17 +++++++++ .../scripts/gh-actions/run-e2e-tune-api.py | 35 +++---------------- 3 files changed, 21 insertions(+), 43 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 31d3585cff2..e72e6f6ef9b 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -33,18 +33,6 @@ jobs: with: tune-api: true training-operator: true - - - name: Check the status of Experiment and Trials - shell: bash - run: | - kubectl get pods -n default - - # describe pod - pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}') - kubectl describe pod $pod_name -n default - - # check the logs of pod - kubectl logs $pod_name -n default strategy: fail-fast: false diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml index 7c9598df04b..ef91c647cea 100644 --- a/.github/workflows/template-e2e-test/action.yaml +++ b/.github/workflows/template-e2e-test/action.yaml @@ -47,3 +47,20 @@ runs: else ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} fi + + - name: Check disk space + shell: bash + run: | + df -hT + + - name: Check the status of Experiment and Trials + shell: bash + run: | + kubectl get pods -n default + + # describe pod + pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}') + kubectl describe pod $pod_name -n default + + # check the logs of pod + kubectl logs $pod_name -n default -c metrics-logger-and-collector diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 48496b864c6..d18c4b66c4e 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -9,7 +9,7 @@ HuggingFaceModelParams, HuggingFaceTrainerParams, ) -from kubernetes import client, config +from kubernetes import client from peft import LoraConfig from verify import verify_experiment_results @@ -19,25 +19,6 @@ # The default logging config. logging.basicConfig(level=logging.INFO) -# Function to get logs of the pod related to the experiment. -def get_experiment_pod_logs(namespace: str, exp_name: str): - v1 = client.CoreV1Api() - pods = v1.list_namespaced_pod(namespace) - - for pod in pods.items: - # Identify the pod associated with the experiment - if exp_name in pod.metadata.name: - logging.info(f"Describing pod: {pod.metadata.name}") - pod_description = v1.read_namespaced_pod(name=pod.metadata.name, namespace=namespace) - logging.info(pod_description) - - logging.info(f"Fetching logs for pod: {pod.metadata.name}") - pod_logs = v1.read_namespaced_pod_log(name=pod.metadata.name, namespace=namespace) - logging.info(pod_logs) - break - else: - logging.warning(f"No pod found for experiment: {exp_name}") - # Test for Experiment created with custom objective. def run_e2e_experiment_create_by_tune_with_custom_objective( @@ -163,8 +144,6 @@ def run_e2e_experiment_create_by_tune_with_external_model( if args.verbose: logging.getLogger().setLevel(logging.DEBUG) - config.load_kube_config() # Load Kubernetes config from the environment - katib_client = KatibClient() namespace_labels = client.CoreV1Api().read_namespace(args.namespace).metadata.labels @@ -184,9 +163,6 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}") raise e finally: - # Describe and get logs of the experiment pod - get_experiment_pod_logs(exp_namespace, exp_name) - # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") @@ -201,11 +177,8 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info("---------------------------------------------------------------") logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_2}") raise e - finally: - # Describe and get logs of the experiment pod - get_experiment_pod_logs(exp_namespace, exp_name_2) - + #finally: # Delete the Experiment. - logging.info("---------------------------------------------------------------") - logging.info("---------------------------------------------------------------") + #logging.info("---------------------------------------------------------------") + #logging.info("---------------------------------------------------------------") #katib_client.delete_experiment(exp_name_2, exp_namespace) From d1e1311bd2af48e3b198f5ed411e96169cf58f2c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 19:17:19 +0800 Subject: [PATCH 14/57] revert timeout limit Signed-off-by: helenxie-bit --- .../workflows/template-e2e-test/action.yaml | 18 +------------ .../scripts/gh-actions/run-e2e-tune-api.py | 27 +++++++++---------- 2 files changed, 14 insertions(+), 31 deletions(-) diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml index ef91c647cea..f5ea534cd2e 100644 --- a/.github/workflows/template-e2e-test/action.yaml +++ b/.github/workflows/template-e2e-test/action.yaml @@ -47,20 +47,4 @@ runs: else ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} fi - - - name: Check disk space - shell: bash - run: | - df -hT - - - name: Check the status of Experiment and Trials - shell: bash - run: | - kubectl get pods -n default - - # describe pod - pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}') - kubectl describe pod $pod_name -n default - - # check the logs of pod - kubectl logs $pod_name -n default -c metrics-logger-and-collector + diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index d18c4b66c4e..a425a3ea105 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -14,7 +14,7 @@ from verify import verify_experiment_results # Experiment timeout is 40 min. -EXPERIMENT_TIMEOUT = 60 * 10 +EXPERIMENT_TIMEOUT = 60 * 40 # The default logging config. logging.basicConfig(level=logging.INFO) @@ -152,33 +152,32 @@ def run_e2e_experiment_create_by_tune_with_external_model( client.CoreV1Api().patch_namespace(args.namespace, {'metadata': {'labels': namespace_labels}}) # Test with run_e2e_experiment_create_by_tune - exp_name = "tune-example-1" + exp_name = "tune-example" exp_namespace = args.namespace try: - run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, exp_name, exp_namespace) + run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, f"{exp_name}-1", exp_namespace) logging.info("---------------------------------------------------------------") - logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{f"{exp_name}-1"}") except Exception as e: logging.info("---------------------------------------------------------------") - logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{f"{exp_name}-1"}") raise e finally: # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") - katib_client.delete_experiment(exp_name, exp_namespace) + katib_client.delete_experiment(f"{exp_name}-1", exp_namespace) - exp_name_2 = "tune-example-2" try: - run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name_2, exp_namespace) + run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace) logging.info("---------------------------------------------------------------") - logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name_2}") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{f"{exp_name}-2"}") except Exception as e: logging.info("---------------------------------------------------------------") - logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_2}") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{f"{exp_name}-2"}") raise e - #finally: + finally: # Delete the Experiment. - #logging.info("---------------------------------------------------------------") - #logging.info("---------------------------------------------------------------") - #katib_client.delete_experiment(exp_name_2, exp_namespace) + logging.info("---------------------------------------------------------------") + logging.info("---------------------------------------------------------------") + katib_client.delete_experiment(f"{exp_name}-2", exp_namespace) From 0cc319f7611593c32efbfc3eb603c4179d84b7bc Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 19:32:18 +0800 Subject: [PATCH 15/57] fix format Signed-off-by: helenxie-bit --- .github/workflows/template-e2e-test/action.yaml | 1 - .../e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml index f5ea534cd2e..7c9598df04b 100644 --- a/.github/workflows/template-e2e-test/action.yaml +++ b/.github/workflows/template-e2e-test/action.yaml @@ -47,4 +47,3 @@ runs: else ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} fi - diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index a425a3ea105..707c8a431ba 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -157,24 +157,24 @@ def run_e2e_experiment_create_by_tune_with_external_model( try: run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, f"{exp_name}-1", exp_namespace) logging.info("---------------------------------------------------------------") - logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{f"{exp_name}-1"}") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-1") except Exception as e: logging.info("---------------------------------------------------------------") - logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{f"{exp_name}-1"}") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-1") raise e finally: # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") katib_client.delete_experiment(f"{exp_name}-1", exp_namespace) - + try: run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace) logging.info("---------------------------------------------------------------") - logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{f"{exp_name}-2"}") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-2") except Exception as e: logging.info("---------------------------------------------------------------") - logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{f"{exp_name}-2"}") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2") raise e finally: # Delete the Experiment. From 03839326e70418bd9a44e55ea77869fa9155632c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 12 Sep 2024 18:00:12 -0600 Subject: [PATCH 16/57] extend timeout limit Signed-off-by: helenxie-bit --- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 707c8a431ba..e27d0d81d07 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -13,8 +13,8 @@ from peft import LoraConfig from verify import verify_experiment_results -# Experiment timeout is 40 min. -EXPERIMENT_TIMEOUT = 60 * 40 +# Experiment timeout is 60 min. +EXPERIMENT_TIMEOUT = 60 * 60 # The default logging config. logging.basicConfig(level=logging.INFO) From 08c86343d22ba5698140097a4e9f2cd80e71f86b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 12 Sep 2024 18:01:51 -0600 Subject: [PATCH 17/57] update training operator sdk version Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index e72e6f6ef9b..c8a728ea391 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -25,8 +25,7 @@ jobs: - name: Install Training Operator SDK shell: bash run: | - pip install git+https://github.com/kubeflow/training-operator.git@v1.8-branch#subdirectory=sdk/python - pip install peft==0.3.0 datasets==2.15.0 transformers==4.38.0 + pip install "kubeflow-training[huggingface]==1.8.1" - name: Run e2e test with tune API uses: ./.github/workflows/template-e2e-test From 7a98a001a0e2e5467269f84992dede8214c269e8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 12 Sep 2024 22:54:10 -0600 Subject: [PATCH 18/57] check the logs of pod Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 7 +++++ .../scripts/gh-actions/run-e2e-tune-api.py | 27 ++++++++++++++++--- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index c8a728ea391..9b9d9658410 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -33,6 +33,13 @@ jobs: tune-api: true training-operator: true + # Step to get logs of the relevant Experiment pod + - name: Fetch Experiment Pod Logs + run: | + POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2) + echo "Fetching logs for pod: $POD_NAME" + kubectl logs $POD_NAME -n default + strategy: fail-fast: false matrix: diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index e27d0d81d07..e35df2aa204 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -14,12 +14,30 @@ from verify import verify_experiment_results # Experiment timeout is 60 min. -EXPERIMENT_TIMEOUT = 60 * 60 +EXPERIMENT_TIMEOUT = 60 * 15 # The default logging config. logging.basicConfig(level=logging.INFO) +def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_namespace: str): + # List all the pods in the namespace + v1 = client.CoreV1Api() + pods = v1.list_namespaced_pod(namespace=exp_namespace) + + # Filter pods related to the specific Katib Experiment + for pod in pods.items: + if exp_name in pod.metadata.name: + logging.info(f"Fetching logs for pod: {pod.metadata.name}") + try: + pod_logs = v1.read_namespaced_pod_log( + name=pod.metadata.name, namespace=exp_namespace + ) + logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs}") + except Exception as e: + logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}") + + # Test for Experiment created with custom objective. def run_e2e_experiment_create_by_tune_with_custom_objective( katib_client: KatibClient, @@ -117,7 +135,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( resources_per_trial=katib.TrainerResources( num_workers=1, num_procs_per_worker=1, - resources_per_worker={"cpu": "2", "memory": "10G",}, + resources_per_worker={"cpu": "1", "memory": "10G",}, ), ) experiment = katib_client.wait_for_experiment_condition( @@ -166,7 +184,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") - katib_client.delete_experiment(f"{exp_name}-1", exp_namespace) + #katib_client.delete_experiment(f"{exp_name}-1", exp_namespace) try: run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace) @@ -175,9 +193,10 @@ def run_e2e_experiment_create_by_tune_with_external_model( except Exception as e: logging.info("---------------------------------------------------------------") logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2") + get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace) raise e finally: # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") - katib_client.delete_experiment(f"{exp_name}-2", exp_namespace) + #katib_client.delete_experiment(f"{exp_name}-2", exp_namespace) From 8862d7965c4b03db2cdb5f36f73a14eaf4af10a9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 12 Sep 2024 22:57:06 -0600 Subject: [PATCH 19/57] rerun tests Signed-off-by: helenxie-bit --- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index e35df2aa204..3a7c4949cc7 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -14,7 +14,7 @@ from verify import verify_experiment_results # Experiment timeout is 60 min. -EXPERIMENT_TIMEOUT = 60 * 15 +EXPERIMENT_TIMEOUT = 60 * 10 # The default logging config. logging.basicConfig(level=logging.INFO) From e4f614dd1e140a6464a3193542a7c664b8c0783d Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 13 Sep 2024 22:13:47 -0600 Subject: [PATCH 20/57] update the function of getting logs Signed-off-by: helenxie-bit --- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 3a7c4949cc7..5dae31ade3e 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -30,10 +30,13 @@ def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_names if exp_name in pod.metadata.name: logging.info(f"Fetching logs for pod: {pod.metadata.name}") try: + # Specify the container name when retrieving logs pod_logs = v1.read_namespaced_pod_log( - name=pod.metadata.name, namespace=exp_namespace + name=pod.metadata.name, + namespace=exp_namespace, + container="metrics-logger-and-collector" # Specify the desired container ) - logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs}") + logging.info(f"Logs for pod {pod.metadata.name} (container: metrics-logger-and-collector):\n{pod_logs}") except Exception as e: logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}") From 0385eeaef053ce7f27cca0adb1146b67c93f6e69 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 13 Sep 2024 22:48:27 -0600 Subject: [PATCH 21/57] add the step of describing pod Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 9b9d9658410..612463bbbfc 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -35,9 +35,11 @@ jobs: # Step to get logs of the relevant Experiment pod - name: Fetch Experiment Pod Logs + if: always() # Run this step even if previous steps fail run: | POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2) echo "Fetching logs for pod: $POD_NAME" + kubectl describe pod $POD_NAME -n default kubectl logs $POD_NAME -n default strategy: From e0c51704111109314e5242d57050a542a77a5d8b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 13 Sep 2024 23:28:49 -0600 Subject: [PATCH 22/57] check disk space Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 612463bbbfc..6d10e3569ff 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -27,11 +27,24 @@ jobs: run: | pip install "kubeflow-training[huggingface]==1.8.1" + # Step to check disk space + - name: Check Disk Space + run: | + echo "Checking disk space usage before e2e test..." + df -h # Run 'df' to check free disk space + - name: Run e2e test with tune API uses: ./.github/workflows/template-e2e-test with: tune-api: true training-operator: true + + # Step to check disk space + - name: Check Disk Space + if: always() # Run this step even if previous steps fail + run: | + echo "Checking disk space usage after e2e test..." + df -h # Run 'df' to check free disk space # Step to get logs of the relevant Experiment pod - name: Fetch Experiment Pod Logs From 0286f7077a52fb4bd106d2dc2e71021d3ae04c56 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 16 Sep 2024 20:19:58 -0600 Subject: [PATCH 23/57] change work directory Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 6d10e3569ff..0748c681fbb 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -13,6 +13,8 @@ jobs: e2e: runs-on: ubuntu-22.04 timeout-minutes: 120 + env: + GITHUB_WORKSPACE: /mnt/docker steps: - name: Checkout uses: actions/checkout@v4 From f6e5ed569d86047d0cace7cec709831cbaa6f4e2 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 16 Sep 2024 21:02:09 -0600 Subject: [PATCH 24/57] change work directory Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 0748c681fbb..329b2ae6173 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -13,11 +13,27 @@ jobs: e2e: runs-on: ubuntu-22.04 timeout-minutes: 120 - env: - GITHUB_WORKSPACE: /mnt/docker steps: - name: Checkout uses: actions/checkout@v4 + + - name: Move docker data directory + shell: bash + run: | + echo "Stopping docker service ..." + sudo systemctl stop docker + DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker + DOCKER_ROOT_DIR=/mnt/docker + echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" + sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR} + echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" + sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR} + echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})" + echo "Starting docker service ..." + sudo systemctl daemon-reload + sudo systemctl start docker + echo "Docker service status:" + sudo systemctl --no-pager -l -o short status docker - name: Setup Test Env uses: ./.github/workflows/template-setup-e2e-test From 7ea7e43b17fc1b2d4e6654bcd835c6485df5ca58 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 16 Sep 2024 21:36:56 -0600 Subject: [PATCH 25/57] increase timeout limit Signed-off-by: helenxie-bit --- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 5dae31ade3e..1a5f9eed99d 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -13,8 +13,8 @@ from peft import LoraConfig from verify import verify_experiment_results -# Experiment timeout is 60 min. -EXPERIMENT_TIMEOUT = 60 * 10 +# Experiment timeout is 40 min. +EXPERIMENT_TIMEOUT = 60 * 40 # The default logging config. logging.basicConfig(level=logging.INFO) From 25d99b198fbbc27b580039c03410906f91ba009f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 16 Sep 2024 22:46:52 -0600 Subject: [PATCH 26/57] check the logs of controller and events Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 2 ++ test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 329b2ae6173..68426f23bed 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -68,10 +68,12 @@ jobs: - name: Fetch Experiment Pod Logs if: always() # Run this step even if previous steps fail run: | + kubectl get pods -n default POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2) echo "Fetching logs for pod: $POD_NAME" kubectl describe pod $POD_NAME -n default kubectl logs $POD_NAME -n default + kubectl get events -n default | grep "tune-example-2" strategy: fail-fast: false diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 1a5f9eed99d..9e327ac6adf 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -14,7 +14,7 @@ from verify import verify_experiment_results # Experiment timeout is 40 min. -EXPERIMENT_TIMEOUT = 60 * 40 +EXPERIMENT_TIMEOUT = 60 * 15 # The default logging config. logging.basicConfig(level=logging.INFO) @@ -197,6 +197,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info("---------------------------------------------------------------") logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2") get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace) + get_experiment_pods_logs(katib_client, "katib-controller", "kubeflow") raise e finally: # Delete the Experiment. From fcd64faad5bbd38a5849e4bde44ed8f927c1b85d Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 17 Sep 2024 19:32:28 -0700 Subject: [PATCH 27/57] change work directory Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 18 ---------------- .../workflows/template-e2e-test/action.yaml | 1 + .../scripts/gh-actions/run-e2e-tune-api.py | 21 ++++++++++++++++++- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 68426f23bed..909d0022ce5 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -16,24 +16,6 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 - - - name: Move docker data directory - shell: bash - run: | - echo "Stopping docker service ..." - sudo systemctl stop docker - DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker - DOCKER_ROOT_DIR=/mnt/docker - echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" - sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR} - echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" - sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR} - echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})" - echo "Starting docker service ..." - sudo systemctl daemon-reload - sudo systemctl start docker - echo "Docker service status:" - sudo systemctl --no-pager -l -o short status docker - name: Setup Test Env uses: ./.github/workflows/template-setup-e2e-test diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml index 7c9598df04b..c4a8c8831e4 100644 --- a/.github/workflows/template-e2e-test/action.yaml +++ b/.github/workflows/template-e2e-test/action.yaml @@ -47,3 +47,4 @@ runs: else ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} fi + working-directory: /mnt/docker diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 9e327ac6adf..39c6d683488 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -36,6 +36,25 @@ def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_names namespace=exp_namespace, container="metrics-logger-and-collector" # Specify the desired container ) + logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs}") + except Exception as e: + logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}") + +def get_experiment_pods_logs_2(katib_client: KatibClient, exp_name: str, exp_namespace: str): + # List all the pods in the namespace + v1 = client.CoreV1Api() + pods = v1.list_namespaced_pod(namespace=exp_namespace) + + # Filter pods related to the specific Katib Experiment + for pod in pods.items: + if exp_name in pod.metadata.name: + logging.info(f"Fetching logs for pod: {pod.metadata.name}") + try: + # Specify the container name when retrieving logs + pod_logs = v1.read_namespaced_pod_log( + name=pod.metadata.name, + namespace=exp_namespace, + ) logging.info(f"Logs for pod {pod.metadata.name} (container: metrics-logger-and-collector):\n{pod_logs}") except Exception as e: logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}") @@ -197,7 +216,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info("---------------------------------------------------------------") logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2") get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace) - get_experiment_pods_logs(katib_client, "katib-controller", "kubeflow") + get_experiment_pods_logs_2(katib_client, "katib-controller", "kubeflow") raise e finally: # Delete the Experiment. From 122c6115d2ed8c884adb4ef5b59b552f4ad029db Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 17 Sep 2024 19:43:19 -0700 Subject: [PATCH 28/57] change work directory Signed-off-by: helenxie-bit --- .github/workflows/template-e2e-test/action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml index c4a8c8831e4..dd58cb12e46 100644 --- a/.github/workflows/template-e2e-test/action.yaml +++ b/.github/workflows/template-e2e-test/action.yaml @@ -47,4 +47,4 @@ runs: else ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} fi - working-directory: /mnt/docker + working-directory: /mnt From c1fde099fc0f4f8753d2d5fa57824f3846f962ee Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 17 Sep 2024 20:31:25 -0700 Subject: [PATCH 29/57] change work directory Signed-off-by: helenxie-bit --- .../workflows/template-e2e-test/action.yaml | 1 - .../template-setup-e2e-test/action.yaml | 25 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml index dd58cb12e46..7c9598df04b 100644 --- a/.github/workflows/template-e2e-test/action.yaml +++ b/.github/workflows/template-e2e-test/action.yaml @@ -47,4 +47,3 @@ runs: else ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} fi - working-directory: /mnt diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml index 75ee040aea2..657113afc4d 100644 --- a/.github/workflows/template-setup-e2e-test/action.yaml +++ b/.github/workflows/template-setup-e2e-test/action.yaml @@ -30,6 +30,31 @@ runs: echo "Disk usage after cleanup:" df -h + + - name: Prune docker images + shell: bash + run: | + docker image prune -a -f + docker system df + df -hT + + - name: Move docker data directory + shell: bash + run: | + echo "Stopping docker service ..." + sudo systemctl stop docker + DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker + DOCKER_ROOT_DIR=/mnt/docker + echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" + sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR} + echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" + sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR} + echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})" + echo "Starting docker service ..." + sudo systemctl daemon-reload + sudo systemctl start docker + echo "Docker service status:" + sudo systemctl --no-pager -l -o short status docker - name: Setup kubectl uses: azure/setup-kubectl@v4 From 8ff6864ae9f7b6f2de4abb715ccabbe11bd653a5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 17 Sep 2024 21:41:16 -0700 Subject: [PATCH 30/57] check the logs of kubelet Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 909d0022ce5..f374c72c291 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -56,6 +56,14 @@ jobs: kubectl describe pod $POD_NAME -n default kubectl logs $POD_NAME -n default kubectl get events -n default | grep "tune-example-2" + + # Step to fetch kubelet logs from Minikube + - name: Fetch Kubelet Logs + if: always() # Run this step even if previous steps fail + shell: bash + run: | + echo "Fetching kubelet logs from Minikube..." + minikube ssh "sudo journalctl -u kubelet" strategy: fail-fast: false From da3c298c1ae4d3cac398bbab5f4ab56974d93f6e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 17 Sep 2024 22:11:05 -0700 Subject: [PATCH 31/57] check the logs of kubelet Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 4 ++-- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index f374c72c291..3f7219e1077 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -62,8 +62,8 @@ jobs: if: always() # Run this step even if previous steps fail shell: bash run: | - echo "Fetching kubelet logs from Minikube..." - minikube ssh "sudo journalctl -u kubelet" + echo "Fetching kubelet logs..." + sudo journalctl -u kubelet strategy: fail-fast: false diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 39c6d683488..f83eb639cbd 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -14,7 +14,7 @@ from verify import verify_experiment_results # Experiment timeout is 40 min. -EXPERIMENT_TIMEOUT = 60 * 15 +EXPERIMENT_TIMEOUT = 60 * 10 # The default logging config. logging.basicConfig(level=logging.INFO) From a1bff26b6ba3999c15c50e77eba096aacd4fc41d Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 18 Sep 2024 17:49:44 -0700 Subject: [PATCH 32/57] increase cpu Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 1 + test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 3f7219e1077..d657928b07e 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -55,6 +55,7 @@ jobs: echo "Fetching logs for pod: $POD_NAME" kubectl describe pod $POD_NAME -n default kubectl logs $POD_NAME -n default + kubectl top pods $POD_NAME kubectl get events -n default | grep "tune-example-2" # Step to fetch kubelet logs from Minikube diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index f83eb639cbd..eeac5d10d7f 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -157,7 +157,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( resources_per_trial=katib.TrainerResources( num_workers=1, num_procs_per_worker=1, - resources_per_worker={"cpu": "1", "memory": "10G",}, + resources_per_worker={"cpu": "2", "memory": "10G",}, ), ) experiment = katib_client.wait_for_experiment_condition( From bbae57bb3c217756f61f5220d5be2ba873455e5c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 18 Sep 2024 18:27:46 -0700 Subject: [PATCH 33/57] check the logs of training operator Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 2 +- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index d657928b07e..e80f3067fd6 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -51,7 +51,7 @@ jobs: if: always() # Run this step even if previous steps fail run: | kubectl get pods -n default - POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2) + POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master) echo "Fetching logs for pod: $POD_NAME" kubectl describe pod $POD_NAME -n default kubectl logs $POD_NAME -n default diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index eeac5d10d7f..4168a8e3786 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -217,6 +217,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2") get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace) get_experiment_pods_logs_2(katib_client, "katib-controller", "kubeflow") + get_experiment_pods_logs_2(katib_client, "training-operator", "kubeflow") raise e finally: # Delete the Experiment. From e45ceac4745e1317feab569256b75d5e2305bffb Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 18 Sep 2024 19:22:12 -0700 Subject: [PATCH 34/57] check the use of resources Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index e80f3067fd6..32cf0a12add 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -54,7 +54,6 @@ jobs: POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master) echo "Fetching logs for pod: $POD_NAME" kubectl describe pod $POD_NAME -n default - kubectl logs $POD_NAME -n default kubectl top pods $POD_NAME kubectl get events -n default | grep "tune-example-2" From 4ae11edbe725c52005587091b39e3f84816641fb Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 19 Sep 2024 20:47:11 -0700 Subject: [PATCH 35/57] check the logs of container 'pytorch' and 'storage_initializer' Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 1 - .../scripts/gh-actions/run-e2e-tune-api.py | 16 ++++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 32cf0a12add..7b41130f499 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -52,7 +52,6 @@ jobs: run: | kubectl get pods -n default POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master) - echo "Fetching logs for pod: $POD_NAME" kubectl describe pod $POD_NAME -n default kubectl top pods $POD_NAME kubectl get events -n default | grep "tune-example-2" diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 4168a8e3786..e5eb39c0d4a 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -31,12 +31,24 @@ def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_names logging.info(f"Fetching logs for pod: {pod.metadata.name}") try: # Specify the container name when retrieving logs - pod_logs = v1.read_namespaced_pod_log( + pod_logs1 = v1.read_namespaced_pod_log( name=pod.metadata.name, namespace=exp_namespace, container="metrics-logger-and-collector" # Specify the desired container ) - logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs}") + logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs1}") + pod_logs2 = v1.read_namespaced_pod_log( + name=pod.metadata.name, + namespace=exp_namespace, + container="pytorch" + ) + logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs2}") + pod_logs3 = v1.read_namespaced_pod_log( + name=pod.metadata.name, + namespace=exp_namespace, + container="storage-initializer" + ) + logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs3}") except Exception as e: logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}") From bedab365208cd07d45c7665c9ffe705ce6419bd3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 19 Sep 2024 21:44:12 -0700 Subject: [PATCH 36/57] fix error of checking use of resources Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 5 ++++- .../v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 13 +++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 7b41130f499..12a890d58d7 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -53,8 +53,11 @@ jobs: kubectl get pods -n default POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master) kubectl describe pod $POD_NAME -n default - kubectl top pods $POD_NAME kubectl get events -n default | grep "tune-example-2" + kubectl get apiservices | grep metrics + minikube addons enable metrics-server + kubectl get pods -n kube-system + kubectl top pods $POD_NAME # Step to fetch kubelet logs from Minikube - name: Fetch Kubelet Logs diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index e5eb39c0d4a..6a125c3033a 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -34,21 +34,26 @@ def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_names pod_logs1 = v1.read_namespaced_pod_log( name=pod.metadata.name, namespace=exp_namespace, - container="metrics-logger-and-collector" # Specify the desired container + container="metrics-logger-and-collector" ) - logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs1}") + logging.info(f"Logs of metrics-logger-and-collector for pod {pod.metadata.name}:\n{pod_logs1}") pod_logs2 = v1.read_namespaced_pod_log( name=pod.metadata.name, namespace=exp_namespace, container="pytorch" ) - logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs2}") + logging.info(f"Logs of pytorch for pod {pod.metadata.name}:\n{pod_logs2}") pod_logs3 = v1.read_namespaced_pod_log( name=pod.metadata.name, namespace=exp_namespace, container="storage-initializer" ) - logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs3}") + logging.info(f"Logs of storage-initializer for pod {pod.metadata.name}:\n{pod_logs3}") + pod_logs4 = v1.read_namespaced_pod_log( + name=pod.metadata.name, + namespace=exp_namespace, + ) + logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs4}") except Exception as e: logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}") From 7bfb3cc2df5b6d8b3ac805d70664d48c15dbeffa Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 19 Sep 2024 22:41:42 -0700 Subject: [PATCH 37/57] add other checks to find the error reason Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 60 +++++++++++++++++++----- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 12a890d58d7..12aac4d2b36 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -27,48 +27,82 @@ jobs: run: | pip install "kubeflow-training[huggingface]==1.8.1" - # Step to check disk space - - name: Check Disk Space + # Step 2: Check Disk Space Before Test + - name: Check Disk Space Before Test run: | echo "Checking disk space usage before e2e test..." df -h # Run 'df' to check free disk space + # Step 3: Run e2e test with tune API - name: Run e2e test with tune API uses: ./.github/workflows/template-e2e-test with: tune-api: true training-operator: true - # Step to check disk space - - name: Check Disk Space + # Step 4: Check Disk Space After Test + - name: Check Disk Space After Test if: always() # Run this step even if previous steps fail run: | echo "Checking disk space usage after e2e test..." df -h # Run 'df' to check free disk space - # Step to get logs of the relevant Experiment pod + # Step 5: Fetch Pod Logs for Relevant Experiment Pod - name: Fetch Experiment Pod Logs if: always() # Run this step even if previous steps fail run: | + echo "Fetching logs for experiment pod..." kubectl get pods -n default POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master) kubectl describe pod $POD_NAME -n default kubectl get events -n default | grep "tune-example-2" - kubectl get apiservices | grep metrics - minikube addons enable metrics-server - kubectl get pods -n kube-system - kubectl top pods $POD_NAME - - # Step to fetch kubelet logs from Minikube + + # Step 6: Fetch kubelet logs (requires sudo for accessing kubelet logs) - name: Fetch Kubelet Logs if: always() # Run this step even if previous steps fail - shell: bash run: | echo "Fetching kubelet logs..." sudo journalctl -u kubelet + + # Step 7: Check Node Resource Usage + - name: Check Node Resource Usage + if: always() + run: | + echo "Checking node resource usage..." + NODE_NAME=$(kubectl get pods -n default -o jsonpath="{.items[0].spec.nodeName}") + kubectl top node $NODE_NAME + + # Step 8: Check Pod Resource Usage + - name: Check Pod Resource Usage + if: always() + run: | + echo "Checking pod resource usage..." + kubectl top pod -n default $POD_NAME + + # Step 9: Fetch Network Information for Pod + - name: Fetch Network Info + if: always() + run: | + echo "Fetching network info for pod $POD_NAME" + kubectl exec $POD_NAME -n default -- ip a + + # Step 10: Check Docker Logs for Container + - name: Check Docker Logs for Container + if: always() + run: | + echo "Fetching Docker logs..." + CONTAINER_ID=$(docker ps | grep $POD_NAME | awk '{print $1}') + docker logs $CONTAINER_ID + + # Step 11: Check Kernel Logs for OOM/Resource Issues + - name: Check Kernel Logs for Resource Issues + if: always() + run: | + echo "Checking kernel logs for resource issues..." + dmesg | grep -i "oom\|kill" strategy: fail-fast: false matrix: - # Detail: https://hub.docker.com/r/kindest/node + # Kubernetes versions to test with kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] From efffdc25abb46f9283cafe6ae72bd94d9b1dc34e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 23:47:19 -0700 Subject: [PATCH 38/57] set 'storage_config' Signed-off-by: helenxie-bit --- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 6a125c3033a..5338b016101 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -176,6 +176,10 @@ def run_e2e_experiment_create_by_tune_with_external_model( num_procs_per_worker=1, resources_per_worker={"cpu": "2", "memory": "10G",}, ), + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, ) experiment = katib_client.wait_for_experiment_condition( exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT From 2a18b17b0fc4236ec5cb83404e2fa0323749f4e9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 20:49:34 -0700 Subject: [PATCH 39/57] reduce the number of tests Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 39 +----------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 12aac4d2b36..d59679e0173 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -63,46 +63,9 @@ jobs: run: | echo "Fetching kubelet logs..." sudo journalctl -u kubelet - - # Step 7: Check Node Resource Usage - - name: Check Node Resource Usage - if: always() - run: | - echo "Checking node resource usage..." - NODE_NAME=$(kubectl get pods -n default -o jsonpath="{.items[0].spec.nodeName}") - kubectl top node $NODE_NAME - - # Step 8: Check Pod Resource Usage - - name: Check Pod Resource Usage - if: always() - run: | - echo "Checking pod resource usage..." - kubectl top pod -n default $POD_NAME - - # Step 9: Fetch Network Information for Pod - - name: Fetch Network Info - if: always() - run: | - echo "Fetching network info for pod $POD_NAME" - kubectl exec $POD_NAME -n default -- ip a - - # Step 10: Check Docker Logs for Container - - name: Check Docker Logs for Container - if: always() - run: | - echo "Fetching Docker logs..." - CONTAINER_ID=$(docker ps | grep $POD_NAME | awk '{print $1}') - docker logs $CONTAINER_ID - - # Step 11: Check Kernel Logs for OOM/Resource Issues - - name: Check Kernel Logs for Resource Issues - if: always() - run: | - echo "Checking kernel logs for resource issues..." - dmesg | grep -i "oom\|kill" strategy: fail-fast: false matrix: # Kubernetes versions to test with - kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] + kubernetes-version: ["v1.29.2"] From c6c964bd95372f2a9c0588f3efb03d0460e9225d Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 21:29:32 -0700 Subject: [PATCH 40/57] Check container runtime logs Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index d59679e0173..115d7dc10ab 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -64,6 +64,12 @@ jobs: echo "Fetching kubelet logs..." sudo journalctl -u kubelet + - name: Check container runtime logs + if: always() # Run this step even if previous steps fail + run: | + echo "Checking container runtime logs..." + sudo journalctl -u docker + strategy: fail-fast: false matrix: From 28ffb96ae59c3bf60289b4fa9155ddbe628bb12e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 22:48:08 -0700 Subject: [PATCH 41/57] set the driver of minikube as docker Signed-off-by: helenxie-bit --- .github/workflows/template-setup-e2e-test/action.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml index 657113afc4d..69665858514 100644 --- a/.github/workflows/template-setup-e2e-test/action.yaml +++ b/.github/workflows/template-setup-e2e-test/action.yaml @@ -31,13 +31,6 @@ runs: echo "Disk usage after cleanup:" df -h - - name: Prune docker images - shell: bash - run: | - docker image prune -a -f - docker system df - df -hT - - name: Move docker data directory shell: bash run: | @@ -66,7 +59,7 @@ runs: with: network-plugin: cni cni: flannel - driver: none + driver: docker kubernetes-version: ${{ inputs.kubernetes-version }} minikube-version: 1.31.1 start-args: --wait-timeout=120s From dc684e30d0a32d9364cc8aafcb97bf6f85ffbfaa Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 23:19:15 -0700 Subject: [PATCH 42/57] set the driver of minikube to none Signed-off-by: helenxie-bit --- .github/workflows/template-setup-e2e-test/action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml index 69665858514..93ac2e3fd4d 100644 --- a/.github/workflows/template-setup-e2e-test/action.yaml +++ b/.github/workflows/template-setup-e2e-test/action.yaml @@ -59,7 +59,7 @@ runs: with: network-plugin: cni cni: flannel - driver: docker + driver: none kubernetes-version: ${{ inputs.kubernetes-version }} minikube-version: 1.31.1 start-args: --wait-timeout=120s From a12034c749c322e1696f4de6c00aa8fd66bf00c4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 24 Sep 2024 13:26:37 -0700 Subject: [PATCH 43/57] check logs of pod Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 6 +- .../scripts/gh-actions/run-e2e-tune-api.py | 62 +------------------ 2 files changed, 6 insertions(+), 62 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 115d7dc10ab..01245fab0b5 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -51,10 +51,14 @@ jobs: - name: Fetch Experiment Pod Logs if: always() # Run this step even if previous steps fail run: | - echo "Fetching logs for experiment pod..." + echo "Fetching all the pods in the default namespace..." kubectl get pods -n default POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master) + echo "Fetching pod description for experiment pod..." kubectl describe pod $POD_NAME -n default + echo "Fetching logs for experiment pod..." + kubectl logs $POD_NAME -n default --all-containers + echo "Fetching events for experiment pod..." kubectl get events -n default | grep "tune-example-2" # Step 6: Fetch kubelet logs (requires sudo for accessing kubelet logs) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 5338b016101..b13c19494fb 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -19,64 +19,6 @@ # The default logging config. logging.basicConfig(level=logging.INFO) - -def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_namespace: str): - # List all the pods in the namespace - v1 = client.CoreV1Api() - pods = v1.list_namespaced_pod(namespace=exp_namespace) - - # Filter pods related to the specific Katib Experiment - for pod in pods.items: - if exp_name in pod.metadata.name: - logging.info(f"Fetching logs for pod: {pod.metadata.name}") - try: - # Specify the container name when retrieving logs - pod_logs1 = v1.read_namespaced_pod_log( - name=pod.metadata.name, - namespace=exp_namespace, - container="metrics-logger-and-collector" - ) - logging.info(f"Logs of metrics-logger-and-collector for pod {pod.metadata.name}:\n{pod_logs1}") - pod_logs2 = v1.read_namespaced_pod_log( - name=pod.metadata.name, - namespace=exp_namespace, - container="pytorch" - ) - logging.info(f"Logs of pytorch for pod {pod.metadata.name}:\n{pod_logs2}") - pod_logs3 = v1.read_namespaced_pod_log( - name=pod.metadata.name, - namespace=exp_namespace, - container="storage-initializer" - ) - logging.info(f"Logs of storage-initializer for pod {pod.metadata.name}:\n{pod_logs3}") - pod_logs4 = v1.read_namespaced_pod_log( - name=pod.metadata.name, - namespace=exp_namespace, - ) - logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs4}") - except Exception as e: - logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}") - -def get_experiment_pods_logs_2(katib_client: KatibClient, exp_name: str, exp_namespace: str): - # List all the pods in the namespace - v1 = client.CoreV1Api() - pods = v1.list_namespaced_pod(namespace=exp_namespace) - - # Filter pods related to the specific Katib Experiment - for pod in pods.items: - if exp_name in pod.metadata.name: - logging.info(f"Fetching logs for pod: {pod.metadata.name}") - try: - # Specify the container name when retrieving logs - pod_logs = v1.read_namespaced_pod_log( - name=pod.metadata.name, - namespace=exp_namespace, - ) - logging.info(f"Logs for pod {pod.metadata.name} (container: metrics-logger-and-collector):\n{pod_logs}") - except Exception as e: - logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}") - - # Test for Experiment created with custom objective. def run_e2e_experiment_create_by_tune_with_custom_objective( katib_client: KatibClient, @@ -180,6 +122,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( "size": "10Gi", "access_modes": ["ReadWriteOnce"], }, + retain_trials=True, ) experiment = katib_client.wait_for_experiment_condition( exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT @@ -236,9 +179,6 @@ def run_e2e_experiment_create_by_tune_with_external_model( except Exception as e: logging.info("---------------------------------------------------------------") logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2") - get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace) - get_experiment_pods_logs_2(katib_client, "katib-controller", "kubeflow") - get_experiment_pods_logs_2(katib_client, "training-operator", "kubeflow") raise e finally: # Delete the Experiment. From b0888155794af54749b4cca94c67d1fb7f8b6d81 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 29 Sep 2024 10:50:20 -0700 Subject: [PATCH 44/57] check memory usage Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 01245fab0b5..7f3ceea9884 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -33,13 +33,32 @@ jobs: echo "Checking disk space usage before e2e test..." df -h # Run 'df' to check free disk space + - name: Monitor Memory Usage Before Run + if: always() + run: free -h + + - name: Monitor Docker Container Memory Usage + if: always() + run: | + docker stats --no-stream + # Step 3: Run e2e test with tune API - name: Run e2e test with tune API + if: always() uses: ./.github/workflows/template-e2e-test with: tune-api: true training-operator: true + - name: Monitor Memory Usage After Run + if: always() + run: free -h + + - name: Monitor Docker Container Memory Usage + if: always() + run: | + docker stats --no-stream + # Step 4: Check Disk Space After Test - name: Check Disk Space After Test if: always() # Run this step even if previous steps fail From e468b27a978ed94fd7396ead4037b76a9482d301 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 29 Sep 2024 11:59:38 -0700 Subject: [PATCH 45/57] increase 'termination_grace_period_seconds' in podspec Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 2 ++ .../kubeflow/katib/api/katib_client.py | 26 ++++++++++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 7f3ceea9884..b1c2c64dba0 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -30,6 +30,8 @@ jobs: # Step 2: Check Disk Space Before Test - name: Check Disk Space Before Test run: | + docker system prune -a + docker volume prune echo "Checking disk space usage before e2e test..." df -h # Run 'df' to check free disk space diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 49c5d88e584..5db30c13387 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -656,15 +656,27 @@ class name in this argument. ), ) - worker_pod_template_spec = training_utils.get_pod_template_spec( - containers=[container_spec], - volumes=[storage_initializer_volume], + worker_pod_template_spec = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=models.V1PodSpec( + containers=[container_spec], + volumes=[storage_initializer_volume], + termination_grace_period_seconds=60, + ), ) - master_pod_template_spec = training_utils.get_pod_template_spec( - containers=[container_spec], - init_containers=[init_container_spec], - volumes=[storage_initializer_volume], + master_pod_template_spec = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=models.V1PodSpec( + init_containers=[init_container_spec], + containers=[container_spec], + volumes=[storage_initializer_volume], + termination_grace_period_seconds=60, + ), ) # Create PyTorchJob. From 64d8fef5062d6f4986028fe07de346da42bb16a2 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 29 Sep 2024 12:45:20 -0700 Subject: [PATCH 46/57] fix annotations error Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 5db30c13387..cb95b20f31b 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -658,7 +658,7 @@ class name in this argument. worker_pod_template_spec = models.V1PodTemplateSpec( metadata=models.V1ObjectMeta( - annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + annotations={"sidecar.istio.io/inject": "false"} ), spec=models.V1PodSpec( containers=[container_spec], @@ -669,7 +669,7 @@ class name in this argument. master_pod_template_spec = models.V1PodTemplateSpec( metadata=models.V1ObjectMeta( - annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + annotations={"sidecar.istio.io/inject": "false"} ), spec=models.V1PodSpec( init_containers=[init_container_spec], From 45db42e73b955d04a21d0931fc41819c1ba2f9c6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 29 Sep 2024 20:25:49 -0700 Subject: [PATCH 47/57] restart docker Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index b1c2c64dba0..282e1b402fe 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -43,6 +43,14 @@ jobs: if: always() run: | docker stats --no-stream + + - name: Restart Docker Service + run: | + echo "Restarting Docker service..." + sudo systemctl restart docker + echo "Docker service status:" + sudo systemctl --no-pager -l -o short status docker + kubectl get pods -n kubeflow # Step 3: Run e2e test with tune API - name: Run e2e test with tune API From c6e91cdf47a0a3596b7dea414be279cd1c9201a8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 29 Sep 2024 20:45:38 -0700 Subject: [PATCH 48/57] delete restarting docker Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 282e1b402fe..f141d034e57 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -27,7 +27,6 @@ jobs: run: | pip install "kubeflow-training[huggingface]==1.8.1" - # Step 2: Check Disk Space Before Test - name: Check Disk Space Before Test run: | docker system prune -a @@ -43,16 +42,7 @@ jobs: if: always() run: | docker stats --no-stream - - - name: Restart Docker Service - run: | - echo "Restarting Docker service..." - sudo systemctl restart docker - echo "Docker service status:" - sudo systemctl --no-pager -l -o short status docker - kubectl get pods -n kubeflow - # Step 3: Run e2e test with tune API - name: Run e2e test with tune API if: always() uses: ./.github/workflows/template-e2e-test @@ -69,14 +59,12 @@ jobs: run: | docker stats --no-stream - # Step 4: Check Disk Space After Test - name: Check Disk Space After Test if: always() # Run this step even if previous steps fail run: | echo "Checking disk space usage after e2e test..." df -h # Run 'df' to check free disk space - # Step 5: Fetch Pod Logs for Relevant Experiment Pod - name: Fetch Experiment Pod Logs if: always() # Run this step even if previous steps fail run: | @@ -90,7 +78,6 @@ jobs: echo "Fetching events for experiment pod..." kubectl get events -n default | grep "tune-example-2" - # Step 6: Fetch kubelet logs (requires sudo for accessing kubelet logs) - name: Fetch Kubelet Logs if: always() # Run this step even if previous steps fail run: | From b1a2390b6542208b1dea824eed2f276a7f79dbd8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 21 Oct 2024 23:30:57 -0700 Subject: [PATCH 49/57] use original docker data directory Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 6 ++++++ .../template-setup-e2e-test/action.yaml | 18 ------------------ 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index f141d034e57..9111f2f1982 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -50,6 +50,12 @@ jobs: tune-api: true training-operator: true + - name: Get YAML file of Experiment + if: always() + run: | + echo "Fetching the YAML file of the experiment..." + kubectl get experiment tune-example-2 -n default -o yaml + - name: Monitor Memory Usage After Run if: always() run: free -h diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml index 93ac2e3fd4d..75ee040aea2 100644 --- a/.github/workflows/template-setup-e2e-test/action.yaml +++ b/.github/workflows/template-setup-e2e-test/action.yaml @@ -30,24 +30,6 @@ runs: echo "Disk usage after cleanup:" df -h - - - name: Move docker data directory - shell: bash - run: | - echo "Stopping docker service ..." - sudo systemctl stop docker - DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker - DOCKER_ROOT_DIR=/mnt/docker - echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" - sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR} - echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" - sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR} - echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})" - echo "Starting docker service ..." - sudo systemctl daemon-reload - sudo systemctl start docker - echo "Docker service status:" - sudo systemctl --no-pager -l -o short status docker - name: Setup kubectl uses: azure/setup-kubectl@v4 From e5bf8401990835778e9cab7e75f194f7aa54551c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 23 Jan 2025 00:14:50 -0800 Subject: [PATCH 50/57] update installation of Katib SDK with extra requires Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 8 +-- .../kubeflow/katib/api/katib_client.py | 49 ++++++++++--------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 9111f2f1982..86cb786647f 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -22,10 +22,10 @@ jobs: with: kubernetes-version: ${{ matrix.kubernetes-version }} - - name: Install Training Operator SDK + - name: Install Katib SDK with extra requires shell: bash run: | - pip install "kubeflow-training[huggingface]==1.8.1" + pip install --prefer-binary -e 'sdk/python/v1beta1[huggingface]' - name: Check Disk Space Before Test run: | @@ -99,5 +99,5 @@ jobs: strategy: fail-fast: false matrix: - # Kubernetes versions to test with - kubernetes-version: ["v1.29.2"] + # Detail: https://hub.docker.com/r/kindest/node + kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"] diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index cb95b20f31b..e5d19da8256 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -415,7 +415,9 @@ class name in this argument. experiment.spec.max_failed_trial_count = max_failed_trial_count # If users choose to use a custom objective function. - if objective is not None: + if objective is not None or parameters is not None: + if not objective or not parameters: + raise ValueError("One of the required parameters is None") # Add metrics collector to the Katib Experiment. # Up to now, we only support parameter `kind`, of which default value # is `StdOut`, to specify the kind of metrics collector. @@ -518,6 +520,7 @@ class name in this argument. from kubeflow.storage_initializer.hugging_face import ( HuggingFaceDatasetParams, HuggingFaceModelParams, + HuggingFaceTrainerParams, ) from kubeflow.storage_initializer.s3 import S3DatasetParams from kubeflow.training import models as training_models @@ -596,6 +599,11 @@ class name in this argument. "or HuggingFaceDatasetParams." ) + if not isinstance(trainer_parameters, HuggingFaceTrainerParams): + raise ValueError( + "Trainer parameters must be an instance of HuggingFaceTrainerParams." + ) + # Iterate over input parameters and do substitutions. experiment_params = [] trial_params = [] @@ -645,7 +653,11 @@ class name in this argument. f"'{training_args}'", ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - resources=resources_per_trial.resources_per_worker, + resources=( + resources_per_trial.resources_per_worker + if resources_per_trial + else None + ), ) # Create the worker and the master pod. @@ -656,27 +668,15 @@ class name in this argument. ), ) - worker_pod_template_spec = models.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=models.V1PodSpec( - containers=[container_spec], - volumes=[storage_initializer_volume], - termination_grace_period_seconds=60, - ), + worker_pod_template_spec = training_utils.get_pod_template_spec( + containers=[container_spec], + volumes=[storage_initializer_volume], ) - master_pod_template_spec = models.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=models.V1PodSpec( - init_containers=[init_container_spec], - containers=[container_spec], - volumes=[storage_initializer_volume], - termination_grace_period_seconds=60, - ), + master_pod_template_spec = training_utils.get_pod_template_spec( + containers=[container_spec], + init_containers=[init_container_spec], + volumes=[storage_initializer_volume], ) # Create PyTorchJob. @@ -691,7 +691,10 @@ class name in this argument. ), ) - if resources_per_trial.num_procs_per_worker: + if ( + resources_per_trial is not None + and resources_per_trial.num_procs_per_worker + ): pytorchjob.spec.nproc_per_node = str( resources_per_trial.num_procs_per_worker ) @@ -703,7 +706,7 @@ class name in this argument. ) ) - if resources_per_trial.num_workers > 1: + if resources_per_trial is not None and resources_per_trial.num_workers > 1: pytorchjob.spec.pytorch_replica_specs["Worker"] = ( training_models.KubeflowOrgV1ReplicaSpec( replicas=resources_per_trial.num_workers - 1, From fca94ae148b2f4504dec209d84158e9f25e62df5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 23 Jan 2025 00:55:06 -0800 Subject: [PATCH 51/57] test trainer image built with cpu Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index e5d19da8256..cbd4f80d1f2 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -635,7 +635,7 @@ class name in this argument. container_spec = training_utils.get_container_spec( name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"], - base_image=TRAINER_TRANSFORMER_IMAGE, + base_image="docker.io/helenxiehz428/trainer:test", args=[ "--model_uri", model_provider_parameters.model_uri, diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index b13c19494fb..6ee1b19d68b 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -14,7 +14,7 @@ from verify import verify_experiment_results # Experiment timeout is 40 min. -EXPERIMENT_TIMEOUT = 60 * 10 +EXPERIMENT_TIMEOUT = 60 * 15 # The default logging config. logging.basicConfig(level=logging.INFO) From a785d353a1feb1f3959a2bbd306269585ed2d207 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 24 Jan 2025 14:42:59 -0800 Subject: [PATCH 52/57] add action of free up disk space (including move docker data directory) Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 24 --------- .../workflows/free-up-disk-space/action.yaml | 49 +++++++++++++++++++ .../template-setup-e2e-test/action.yaml | 15 +----- .../kubeflow/katib/api/katib_client.py | 2 +- 4 files changed, 52 insertions(+), 38 deletions(-) create mode 100644 .github/workflows/free-up-disk-space/action.yaml diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 86cb786647f..54550b03542 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -34,36 +34,12 @@ jobs: echo "Checking disk space usage before e2e test..." df -h # Run 'df' to check free disk space - - name: Monitor Memory Usage Before Run - if: always() - run: free -h - - - name: Monitor Docker Container Memory Usage - if: always() - run: | - docker stats --no-stream - - name: Run e2e test with tune API if: always() uses: ./.github/workflows/template-e2e-test with: tune-api: true training-operator: true - - - name: Get YAML file of Experiment - if: always() - run: | - echo "Fetching the YAML file of the experiment..." - kubectl get experiment tune-example-2 -n default -o yaml - - - name: Monitor Memory Usage After Run - if: always() - run: free -h - - - name: Monitor Docker Container Memory Usage - if: always() - run: | - docker stats --no-stream - name: Check Disk Space After Test if: always() # Run this step even if previous steps fail diff --git a/.github/workflows/free-up-disk-space/action.yaml b/.github/workflows/free-up-disk-space/action.yaml new file mode 100644 index 00000000000..110e3a21b84 --- /dev/null +++ b/.github/workflows/free-up-disk-space/action.yaml @@ -0,0 +1,49 @@ +name: Free-Up Disk Space +description: Remove Non-Essential Tools And Move Docker Data Directory to /mnt/docker + +runs: + using: composite + steps: + # This step is a Workaround to avoid the "No space left on device" error. + # ref: https://github.com/actions/runner-images/issues/2840 + - name: Remove unnecessary files + shell: bash + run: | + echo "Disk usage before cleanup:" + df -hT + + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/local/share/powershell + sudo rm -rf /usr/share/swift + + echo "Disk usage after cleanup:" + df -hT + + - name: Prune docker images + shell: bash + run: | + docker image prune -a -f + docker system df + df -hT + + - name: Move docker data directory + shell: bash + run: | + echo "Stopping docker service ..." + sudo systemctl stop docker + DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker + DOCKER_ROOT_DIR=/mnt/docker + echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" + sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR} + echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}" + sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR} + echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})" + echo "Starting docker service ..." + sudo systemctl daemon-reload + sudo systemctl start docker + echo "Docker service status:" + sudo systemctl --no-pager -l -o short status docker \ No newline at end of file diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml index 561f127648a..f85697fe3d5 100644 --- a/.github/workflows/template-setup-e2e-test/action.yaml +++ b/.github/workflows/template-setup-e2e-test/action.yaml @@ -17,19 +17,8 @@ runs: steps: # This step is a Workaround to avoid the "No space left on device" error. # ref: https://github.com/actions/runner-images/issues/2840 - - name: Remove unnecessary files - shell: bash - run: | - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/local/share/powershell - sudo rm -rf /usr/share/swift - - echo "Disk usage after cleanup:" - df -h + - name: Free-Up Disk Space + uses: ./.github/workflows/free-up-disk-space - name: Setup kubectl uses: azure/setup-kubectl@v4 diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 82a3712cda1..b641800290f 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -635,7 +635,7 @@ class name in this argument. container_spec = training_utils.get_container_spec( name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"], - base_image="docker.io/helenxiehz428/trainer:test", + base_image=TRAINER_TRANSFORMER_IMAGE, args=[ "--model_uri", model_provider_parameters.model_uri, From 865379e37ec5200a65593a61f1cf1aedf79d9940 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 24 Jan 2025 15:59:59 -0800 Subject: [PATCH 53/57] delete unnecessary checks and update the part of fetching pod description and logs Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 57 ++++++++----------- .../scripts/gh-actions/run-e2e-tune-api.py | 14 +---- 2 files changed, 25 insertions(+), 46 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 54550b03542..753802788bb 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -27,50 +27,39 @@ jobs: run: | pip install --prefer-binary -e 'sdk/python/v1beta1[huggingface]' - - name: Check Disk Space Before Test - run: | - docker system prune -a - docker volume prune - echo "Checking disk space usage before e2e test..." - df -h # Run 'df' to check free disk space - - name: Run e2e test with tune API - if: always() uses: ./.github/workflows/template-e2e-test with: tune-api: true training-operator: true - - - name: Check Disk Space After Test - if: always() # Run this step even if previous steps fail - run: | - echo "Checking disk space usage after e2e test..." - df -h # Run 'df' to check free disk space - - name: Fetch Experiment Pod Logs - if: always() # Run this step even if previous steps fail + - name: Fetch Pod Description and Logs for Experiment # This step is added to debug the test failure + if: always() run: | - echo "Fetching all the pods in the default namespace..." + echo "Fetching all the pods..." kubectl get pods -n default - POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master) - echo "Fetching pod description for experiment pod..." - kubectl describe pod $POD_NAME -n default - echo "Fetching logs for experiment pod..." - kubectl logs $POD_NAME -n default --all-containers - echo "Fetching events for experiment pod..." - kubectl get events -n default | grep "tune-example-2" - - - name: Fetch Kubelet Logs - if: always() # Run this step even if previous steps fail - run: | - echo "Fetching kubelet logs..." - sudo journalctl -u kubelet - - name: Check container runtime logs - if: always() # Run this step even if previous steps fail + POD_NAME_1=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-1 | grep master) + + echo "Fetching pod description for tune-example-1..." + kubectl describe pod $POD_NAME_1 -n default + + echo "Fetching pod logs for tune-example-1..." + kubectl logs $POD_NAME_1 -n default --all-containers + + POD_NAME_2=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master) + + echo "Fetching pod description for tune-example-2..." + kubectl describe pod $POD_NAME_2 -n default + + echo "Fetching pod logs for tune-example-2..." + kubectl logs $POD_NAME_2 -n default --all-containers + + - name: Delete Experiment for e2e test + if: always() run: | - echo "Checking container runtime logs..." - sudo journalctl -u docker + kubectl delete experiment tune-example-1 -n default + kubectl delete experiment tune-example-2 -n default strategy: fail-fast: false diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 6ee1b19d68b..7a98a93c20e 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -14,12 +14,12 @@ from verify import verify_experiment_results # Experiment timeout is 40 min. -EXPERIMENT_TIMEOUT = 60 * 15 +EXPERIMENT_TIMEOUT = 60 * 40 # The default logging config. logging.basicConfig(level=logging.INFO) -# Test for Experiment created with custom objective. +# Test for Experiment created with custom objective function. def run_e2e_experiment_create_by_tune_with_custom_objective( katib_client: KatibClient, exp_name: str, @@ -166,11 +166,6 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info("---------------------------------------------------------------") logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-1") raise e - finally: - # Delete the Experiment. - logging.info("---------------------------------------------------------------") - logging.info("---------------------------------------------------------------") - #katib_client.delete_experiment(f"{exp_name}-1", exp_namespace) try: run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace) @@ -180,8 +175,3 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info("---------------------------------------------------------------") logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2") raise e - finally: - # Delete the Experiment. - logging.info("---------------------------------------------------------------") - logging.info("---------------------------------------------------------------") - #katib_client.delete_experiment(f"{exp_name}-2", exp_namespace) From d1ea629f77fd1d0228a3b35a125fd973ce1f7db6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 24 Jan 2025 16:27:37 -0800 Subject: [PATCH 54/57] delete fetching pod logs Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-tune-api.yaml | 28 ------------------- .../scripts/gh-actions/run-e2e-tune-api.py | 10 +++++++ 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml index 753802788bb..186a0983b81 100644 --- a/.github/workflows/e2e-test-tune-api.yaml +++ b/.github/workflows/e2e-test-tune-api.yaml @@ -33,34 +33,6 @@ jobs: tune-api: true training-operator: true - - name: Fetch Pod Description and Logs for Experiment # This step is added to debug the test failure - if: always() - run: | - echo "Fetching all the pods..." - kubectl get pods -n default - - POD_NAME_1=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-1 | grep master) - - echo "Fetching pod description for tune-example-1..." - kubectl describe pod $POD_NAME_1 -n default - - echo "Fetching pod logs for tune-example-1..." - kubectl logs $POD_NAME_1 -n default --all-containers - - POD_NAME_2=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master) - - echo "Fetching pod description for tune-example-2..." - kubectl describe pod $POD_NAME_2 -n default - - echo "Fetching pod logs for tune-example-2..." - kubectl logs $POD_NAME_2 -n default --all-containers - - - name: Delete Experiment for e2e test - if: always() - run: | - kubectl delete experiment tune-example-1 -n default - kubectl delete experiment tune-example-2 -n default - strategy: fail-fast: false matrix: diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 7a98a93c20e..9e2cb732343 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -166,6 +166,11 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info("---------------------------------------------------------------") logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-1") raise e + finally: + # Delete the Experiment. + logging.info("---------------------------------------------------------------") + logging.info("---------------------------------------------------------------") + katib_client.delete_experiment(f"{exp_name}-1", exp_namespace) try: run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace) @@ -175,3 +180,8 @@ def run_e2e_experiment_create_by_tune_with_external_model( logging.info("---------------------------------------------------------------") logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2") raise e + finally: + # Delete the Experiment. + logging.info("---------------------------------------------------------------") + logging.info("---------------------------------------------------------------") + katib_client.delete_experiment(f"{exp_name}-2", exp_namespace) From 5e2e44f9c869d0163c6fb395222489332def62f7 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 27 Jan 2025 09:55:11 -0800 Subject: [PATCH 55/57] add blank line at the end of free-up-disk-space yaml file Signed-off-by: helenxie-bit --- .github/workflows/free-up-disk-space/action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/free-up-disk-space/action.yaml b/.github/workflows/free-up-disk-space/action.yaml index 110e3a21b84..c85e44e8c58 100644 --- a/.github/workflows/free-up-disk-space/action.yaml +++ b/.github/workflows/free-up-disk-space/action.yaml @@ -46,4 +46,4 @@ runs: sudo systemctl daemon-reload sudo systemctl start docker echo "Docker service status:" - sudo systemctl --no-pager -l -o short status docker \ No newline at end of file + sudo systemctl --no-pager -l -o short status docker From 982e2687d4e1de5e06fbdf563863107ef81a2a66 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 27 Jan 2025 09:57:25 -0800 Subject: [PATCH 56/57] update experiment name Signed-off-by: helenxie-bit --- .../scripts/gh-actions/run-e2e-tune-api.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index 9e2cb732343..aaa6e074d56 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -156,32 +156,33 @@ def run_e2e_experiment_create_by_tune_with_external_model( client.CoreV1Api().patch_namespace(args.namespace, {'metadata': {'labels': namespace_labels}}) # Test with run_e2e_experiment_create_by_tune - exp_name = "tune-example" + exp_name_custom_objective = "tune-example-1" + exp_name_llm_optimization = "tune-example-2" exp_namespace = args.namespace try: - run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, f"{exp_name}-1", exp_namespace) + run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, exp_name_custom_objective, exp_namespace) logging.info("---------------------------------------------------------------") - logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-1") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name_custom_objective}") except Exception as e: logging.info("---------------------------------------------------------------") - logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-1") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_custom_objective}") raise e finally: # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") - katib_client.delete_experiment(f"{exp_name}-1", exp_namespace) + katib_client.delete_experiment(exp_name_custom_objective, exp_namespace) try: - run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace) + run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name_llm_optimization, exp_namespace) logging.info("---------------------------------------------------------------") - logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-2") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name_llm_optimization}") except Exception as e: logging.info("---------------------------------------------------------------") - logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_llm_optimization}") raise e finally: # Delete the Experiment. logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") - katib_client.delete_experiment(f"{exp_name}-2", exp_namespace) + katib_client.delete_experiment(exp_name_llm_optimization, exp_namespace) From 55c404d691276695b08ccb24c84fbc04fb0be66f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 27 Jan 2025 09:59:01 -0800 Subject: [PATCH 57/57] update test function name to be consistent with experiment name Signed-off-by: helenxie-bit --- test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py index aaa6e074d56..b9302d4f8fa 100644 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -66,7 +66,7 @@ def objective(parameters): logging.debug(katib_client.get_suggestion(exp_name, exp_namespace)) # Test for Experiment created with external models and datasets. -def run_e2e_experiment_create_by_tune_with_external_model( +def run_e2e_experiment_create_by_tune_with_llm_optimization( katib_client: KatibClient, exp_name: str, exp_namespace: str, @@ -174,7 +174,7 @@ def run_e2e_experiment_create_by_tune_with_external_model( katib_client.delete_experiment(exp_name_custom_objective, exp_namespace) try: - run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name_llm_optimization, exp_namespace) + run_e2e_experiment_create_by_tune_with_llm_optimization(katib_client, exp_name_llm_optimization, exp_namespace) logging.info("---------------------------------------------------------------") logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name_llm_optimization}") except Exception as e: