From 6be7f29582a114cc01b6e92b485f04c4dfddd5c1 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 3 Sep 2024 21:10:31 +0800
Subject: [PATCH 01/57] add e2e test for tune api

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      |  5 +
 .../scripts/gh-actions/run-e2e-tune-api.py    | 92 ++++++++++++++++++-
 2 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index e1f37a3701b..6ac0c6e0dfc 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -22,10 +22,15 @@ jobs:
         with:
           kubernetes-version: ${{ matrix.kubernetes-version }}
       
+      - name: Install Training Operator SDK
+        shell: bash
+        run: pip install kubeflow-training[huggingface]
+      
       - name: Run e2e test with tune API
         uses: ./.github/workflows/template-e2e-test
         with:
           tune-api: true
+          training-operator: true
 
     strategy:
       fail-fast: false
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index c9d1cb2ee43..944def8cd36 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -1,8 +1,15 @@
 import argparse
 import logging
 
-from kubeflow.katib import KatibClient, search
+import transformers
+from kubeflow.katib import KatibClient, search, types
+from kubeflow.storage_initializer.hugging_face import (
+    HuggingFaceDatasetParams,
+    HuggingFaceModelParams,
+    HuggingFaceTrainerParams,
+)
 from kubernetes import client
+from peft import LoraConfig
 from verify import verify_experiment_results
 
 # Experiment timeout is 40 min.
@@ -12,7 +19,8 @@
 logging.basicConfig(level=logging.INFO)
 
 
-def run_e2e_experiment_create_by_tune(
+# Test for Experiment created with custom objective.
+def run_e2e_experiment_create_by_tune_with_custom_objective(
     katib_client: KatibClient,
     exp_name: str,
     exp_namespace: str,
@@ -57,6 +65,70 @@ def objective(parameters):
     logging.debug(katib_client.get_experiment(exp_name, exp_namespace))
     logging.debug(katib_client.get_suggestion(exp_name, exp_namespace))
 
+# Test for Experiment created with external models and datasets.
+def run_e2e_experiment_create_by_tune_with_external_model(
+    katib_client: KatibClient,
+    exp_name: str,
+    exp_namespace: str,
+):
+    # Create Katib Experiment and wait until it is finished.
+    logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name))
+    
+    # Use the test case from fine-tuning API tutorial.
+    # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/
+    # Create Katib Experiment.
+    # And Wait until Experiment reaches Succeeded condition.
+    katib_client.tune(
+        name=exp_name,
+        namespace=exp_namespace,
+        # BERT model URI and type of Transformer to train it.
+        model_provider_parameters=HuggingFaceModelParams(
+            model_uri="hf://google-bert/bert-base-cased",
+            transformer_type=transformers.AutoModelForSequenceClassification,
+            num_labels=5,
+        ),
+        # In order to save test time, use 8 samples from Yelp dataset.
+        dataset_provider_parameters=HuggingFaceDatasetParams(
+            repo_id="yelp_review_full",
+            split="train[:8]",
+        ),
+        # Specify HuggingFace Trainer parameters.
+        trainer_parameters=HuggingFaceTrainerParams(
+            training_parameters=transformers.TrainingArguments(
+                output_dir="test_tune_api",
+                save_strategy="no",
+                learning_rate = search.double(min=1e-05, max=5e-05),
+                num_train_epochs=1,
+            ),
+            # Set LoRA config to reduce number of trainable model parameters.
+            lora_config=LoraConfig(
+                r = search.int(min=8, max=32),
+                lora_alpha=8,
+                lora_dropout=0.1,
+                bias="none",
+            ),
+        ),
+        objective_metric_name = "train_loss", 
+        objective_type = "minimize", 
+        algorithm_name = "random",
+        max_trial_count = 1,
+        parallel_trial_count = 1,
+        resources_per_trial=types.TrainerResources(
+            num_workers=1,
+            num_procs_per_worker=1,
+            resources_per_worker={"cpu": "2", "memory": "10G",},
+        ),
+    )
+    experiment = katib_client.wait_for_experiment_condition(
+        exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT
+    )
+
+    # Verify the Experiment results.
+    verify_experiment_results(katib_client, experiment, exp_name, exp_namespace)
+
+    # Print the Experiment and Suggestion.
+    logging.debug(katib_client.get_experiment(exp_name, exp_namespace))
+    logging.debug(katib_client.get_suggestion(exp_name, exp_namespace))
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -82,7 +154,21 @@ def objective(parameters):
     exp_name = "tune-example"
     exp_namespace = args.namespace
     try:
-        run_e2e_experiment_create_by_tune(katib_client, exp_name, exp_namespace)
+        run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, exp_name, exp_namespace)
+        logging.info("---------------------------------------------------------------")
+        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}")
+    except Exception as e:
+        logging.info("---------------------------------------------------------------")
+        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}")
+        raise e
+    finally:
+        # Delete the Experiment.
+        logging.info("---------------------------------------------------------------")
+        logging.info("---------------------------------------------------------------")
+        katib_client.delete_experiment(exp_name, exp_namespace)
+    
+    try:
+        run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name, exp_namespace)
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}")
     except Exception as e:

From 1a1f119a1ff5c9f49eaa3f5b2f9b23d10a1fa1aa Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 3 Sep 2024 21:38:10 +0800
Subject: [PATCH 02/57] upgrade training-operator sdk

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 6ac0c6e0dfc..2c45d75f99b 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -24,7 +24,7 @@ jobs:
       
       - name: Install Training Operator SDK
         shell: bash
-        run: pip install kubeflow-training[huggingface]
+        run: pip install -U kubeflow-training[huggingface]
       
       - name: Run e2e test with tune API
         uses: ./.github/workflows/template-e2e-test

From 8461a49230b240dea4cfaaf0280bbe356d091385 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 3 Sep 2024 21:50:06 +0800
Subject: [PATCH 03/57] specify the version of training operator sdk

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 2c45d75f99b..c631ad9420b 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -24,7 +24,7 @@ jobs:
       
       - name: Install Training Operator SDK
         shell: bash
-        run: pip install -U kubeflow-training[huggingface]
+        run: pip install kubeflow-training[huggingface]==1.8.0
       
       - name: Run e2e test with tune API
         uses: ./.github/workflows/template-e2e-test

From c860238525aba14dcb57e207e1449af7ade0735b Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 3 Sep 2024 22:16:45 +0800
Subject: [PATCH 04/57] fix num_labels error and update the version of training
 operator controller

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml              | 2 +-
 sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 ++
 test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh    | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index c631ad9420b..2c45d75f99b 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -24,7 +24,7 @@ jobs:
       
       - name: Install Training Operator SDK
         shell: bash
-        run: pip install kubeflow-training[huggingface]==1.8.0
+        run: pip install -U kubeflow-training[huggingface]
       
       - name: Run e2e test with tune API
         uses: ./.github/workflows/template-e2e-test
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
index 05fd1405a3f..49c5d88e584 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -633,6 +633,8 @@ class name in this argument.
                     model_provider_parameters.model_uri,
                     "--transformer_type",
                     model_provider_parameters.transformer_type.__name__,
+                    "--num_labels",
+                    str(model_provider_parameters.num_labels),
                     "--model_dir",
                     VOLUME_PATH_MODEL,
                     "--dataset_dir",
diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh
index d0b05caf712..68f5e6d1a5d 100755
--- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh
+++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh
@@ -25,7 +25,7 @@ DEPLOY_TRAINING_OPERATOR=${2:-false}
 WITH_DATABASE_TYPE=${3:-mysql}
 
 E2E_TEST_IMAGE_TAG="e2e-test"
-TRAINING_OPERATOR_VERSION="v1.6.0-rc.0"
+TRAINING_OPERATOR_VERSION="v1.8.0"
 
 echo "Start to install Katib"
 

From 216ebd9a4411815dadc1134882c1acdd8be203d0 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 3 Sep 2024 22:30:39 +0800
Subject: [PATCH 05/57] check the version of training operator

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 2c45d75f99b..a7609e8c47d 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -24,7 +24,9 @@ jobs:
       
       - name: Install Training Operator SDK
         shell: bash
-        run: pip install -U kubeflow-training[huggingface]
+        run: |
+          pip install -U kubeflow-training[huggingface]
+          pip show kubeflow-training | grep Version
       
       - name: Run e2e test with tune API
         uses: ./.github/workflows/template-e2e-test

From f6b96f5e10708e18dff97c36f638ddef3b04ed73 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 3 Sep 2024 22:55:27 +0800
Subject: [PATCH 06/57] debug HuggingFaceModelParams module and annotations

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 944def8cd36..b339f0458e5 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -10,6 +10,7 @@
 )
 from kubernetes import client
 from peft import LoraConfig
+from typing import get_type_hints
 from verify import verify_experiment_results
 
 # Experiment timeout is 40 min.
@@ -71,6 +72,10 @@ def run_e2e_experiment_create_by_tune_with_external_model(
     exp_name: str,
     exp_namespace: str,
 ):
+    # Debugging: Print the module and annotations of HuggingFaceModelParams
+    print("HuggingFaceModelParams is defined in module:", HuggingFaceModelParams.__module__)
+    print("HuggingFaceModelParams annotations:", get_type_hints(HuggingFaceModelParams))
+
     # Create Katib Experiment and wait until it is finished.
     logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name))
     

From c6364932778bf632bc3ebbfd4ae6f71c2cc520ac Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 3 Sep 2024 23:15:21 +0800
Subject: [PATCH 07/57] check import path of HuggingFaceModelParams

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index b339f0458e5..0312d52b902 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -10,6 +10,7 @@
 )
 from kubernetes import client
 from peft import LoraConfig
+import sys
 from typing import get_type_hints
 from verify import verify_experiment_results
 
@@ -75,6 +76,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
     # Debugging: Print the module and annotations of HuggingFaceModelParams
     print("HuggingFaceModelParams is defined in module:", HuggingFaceModelParams.__module__)
     print("HuggingFaceModelParams annotations:", get_type_hints(HuggingFaceModelParams))
+    print(sys.modules['kubeflow'].HuggingFaceModelParams.__file__)
 
     # Create Katib Experiment and wait until it is finished.
     logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name))

From 8180422d050129d72bb80cf752cdb37db2c8aac2 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 5 Sep 2024 14:41:36 +0800
Subject: [PATCH 08/57] update the version of training operator sdk

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml               |  4 ++--
 .../e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 10 ++--------
 test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh     |  2 +-
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index a7609e8c47d..e72e6f6ef9b 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -25,8 +25,8 @@ jobs:
       - name: Install Training Operator SDK
         shell: bash
         run: |
-          pip install -U kubeflow-training[huggingface]
-          pip show kubeflow-training | grep Version
+          pip install git+https://github.com/kubeflow/training-operator.git@v1.8-branch#subdirectory=sdk/python
+          pip install peft==0.3.0 datasets==2.15.0 transformers==4.38.0
       
       - name: Run e2e test with tune API
         uses: ./.github/workflows/template-e2e-test
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 0312d52b902..10bc75b41e6 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -1,6 +1,7 @@
 import argparse
 import logging
 
+import kubeflow.katib as katib
 import transformers
 from kubeflow.katib import KatibClient, search, types
 from kubeflow.storage_initializer.hugging_face import (
@@ -10,8 +11,6 @@
 )
 from kubernetes import client
 from peft import LoraConfig
-import sys
-from typing import get_type_hints
 from verify import verify_experiment_results
 
 # Experiment timeout is 40 min.
@@ -73,11 +72,6 @@ def run_e2e_experiment_create_by_tune_with_external_model(
     exp_name: str,
     exp_namespace: str,
 ):
-    # Debugging: Print the module and annotations of HuggingFaceModelParams
-    print("HuggingFaceModelParams is defined in module:", HuggingFaceModelParams.__module__)
-    print("HuggingFaceModelParams annotations:", get_type_hints(HuggingFaceModelParams))
-    print(sys.modules['kubeflow'].HuggingFaceModelParams.__file__)
-
     # Create Katib Experiment and wait until it is finished.
     logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name))
     
@@ -120,7 +114,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         algorithm_name = "random",
         max_trial_count = 1,
         parallel_trial_count = 1,
-        resources_per_trial=types.TrainerResources(
+        resources_per_trial=katib.TrainerResources(
             num_workers=1,
             num_procs_per_worker=1,
             resources_per_worker={"cpu": "2", "memory": "10G",},
diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh
index 68f5e6d1a5d..d0b05caf712 100755
--- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh
+++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh
@@ -25,7 +25,7 @@ DEPLOY_TRAINING_OPERATOR=${2:-false}
 WITH_DATABASE_TYPE=${3:-mysql}
 
 E2E_TEST_IMAGE_TAG="e2e-test"
-TRAINING_OPERATOR_VERSION="v1.8.0"
+TRAINING_OPERATOR_VERSION="v1.6.0-rc.0"
 
 echo "Start to install Katib"
 

From 6101489db88f264cac13d08d6e8ff2213052ffa5 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 5 Sep 2024 15:05:33 +0800
Subject: [PATCH 09/57] update the name of experiment

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .../v1beta1/scripts/gh-actions/run-e2e-tune-api.py    | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 10bc75b41e6..640cb2a595b 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -152,7 +152,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         client.CoreV1Api().patch_namespace(args.namespace, {'metadata': {'labels': namespace_labels}})
 
     # Test with run_e2e_experiment_create_by_tune
-    exp_name = "tune-example"
+    exp_name = "tune-example-1"
     exp_namespace = args.namespace
     try:
         run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, exp_name, exp_namespace)
@@ -168,16 +168,17 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info("---------------------------------------------------------------")
         katib_client.delete_experiment(exp_name, exp_namespace)
     
+    exp_name_2 = "tune-example-2"
     try:
-        run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name, exp_namespace)
+        run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name_2, exp_namespace)
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}")
+        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name_2}")
     except Exception as e:
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}")
+        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_2}")
         raise e
     finally:
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
-        katib_client.delete_experiment(exp_name, exp_namespace)
+        katib_client.delete_experiment(exp_name_2, exp_namespace)

From d67a1b8a0cb1f80b1a2b45f0d0149e8003cea822 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 5 Sep 2024 15:51:12 +0800
Subject: [PATCH 10/57] add step of checking pod

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml             | 12 ++++++++++++
 .../v1beta1/scripts/gh-actions/run-e2e-tune-api.py   |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index e72e6f6ef9b..31d3585cff2 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -33,6 +33,18 @@ jobs:
         with:
           tune-api: true
           training-operator: true
+      
+      - name: Check the status of Experiment and Trials
+        shell: bash
+        run: |
+          kubectl get pods -n default
+          
+          # describe pod
+          pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}')
+          kubectl describe pod $pod_name -n default
+          
+          # check the logs of pod
+          kubectl logs $pod_name -n default
 
     strategy:
       fail-fast: false
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 640cb2a595b..135f40c6ef8 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -14,7 +14,7 @@
 from verify import verify_experiment_results
 
 # Experiment timeout is 40 min.
-EXPERIMENT_TIMEOUT = 60 * 40
+EXPERIMENT_TIMEOUT = 60 * 10
 
 # The default logging config.
 logging.basicConfig(level=logging.INFO)

From 295abb6f1786ca80d8006aa2ce9205fe6515fafb Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 5 Sep 2024 17:02:02 +0800
Subject: [PATCH 11/57] check the logs of pod

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      | 12 -------
 .../scripts/gh-actions/run-e2e-tune-api.py    | 33 +++++++++++++++++--
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 31d3585cff2..e72e6f6ef9b 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -33,18 +33,6 @@ jobs:
         with:
           tune-api: true
           training-operator: true
-      
-      - name: Check the status of Experiment and Trials
-        shell: bash
-        run: |
-          kubectl get pods -n default
-          
-          # describe pod
-          pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}')
-          kubectl describe pod $pod_name -n default
-          
-          # check the logs of pod
-          kubectl logs $pod_name -n default
 
     strategy:
       fail-fast: false
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 135f40c6ef8..48496b864c6 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -3,13 +3,13 @@
 
 import kubeflow.katib as katib
 import transformers
-from kubeflow.katib import KatibClient, search, types
+from kubeflow.katib import KatibClient, search
 from kubeflow.storage_initializer.hugging_face import (
     HuggingFaceDatasetParams,
     HuggingFaceModelParams,
     HuggingFaceTrainerParams,
 )
-from kubernetes import client
+from kubernetes import client, config
 from peft import LoraConfig
 from verify import verify_experiment_results
 
@@ -19,6 +19,25 @@
 # The default logging config.
 logging.basicConfig(level=logging.INFO)
 
+# Function to get logs of the pod related to the experiment.
+def get_experiment_pod_logs(namespace: str, exp_name: str):
+    v1 = client.CoreV1Api()
+    pods = v1.list_namespaced_pod(namespace)
+    
+    for pod in pods.items:
+        # Identify the pod associated with the experiment
+        if exp_name in pod.metadata.name:
+            logging.info(f"Describing pod: {pod.metadata.name}")
+            pod_description = v1.read_namespaced_pod(name=pod.metadata.name, namespace=namespace)
+            logging.info(pod_description)
+            
+            logging.info(f"Fetching logs for pod: {pod.metadata.name}")
+            pod_logs = v1.read_namespaced_pod_log(name=pod.metadata.name, namespace=namespace)
+            logging.info(pod_logs)
+            break
+    else:
+        logging.warning(f"No pod found for experiment: {exp_name}")
+
 
 # Test for Experiment created with custom objective.
 def run_e2e_experiment_create_by_tune_with_custom_objective(
@@ -144,6 +163,8 @@ def run_e2e_experiment_create_by_tune_with_external_model(
     if args.verbose:
         logging.getLogger().setLevel(logging.DEBUG)
 
+    config.load_kube_config()  # Load Kubernetes config from the environment
+
     katib_client = KatibClient()
 
     namespace_labels = client.CoreV1Api().read_namespace(args.namespace).metadata.labels
@@ -163,6 +184,9 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}")
         raise e
     finally:
+        # Describe and get logs of the experiment pod
+        get_experiment_pod_logs(exp_namespace, exp_name)
+
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
@@ -178,7 +202,10 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_2}")
         raise e
     finally:
+        # Describe and get logs of the experiment pod
+        get_experiment_pod_logs(exp_namespace, exp_name_2)
+
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
-        katib_client.delete_experiment(exp_name_2, exp_namespace)
+        #katib_client.delete_experiment(exp_name_2, exp_namespace)

From e0a1b6dad38e555f604eae15a3f6aa3216bdbc37 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 5 Sep 2024 17:17:33 +0800
Subject: [PATCH 12/57] add workflow step to check trial pod status and logs

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index e72e6f6ef9b..31d3585cff2 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -33,6 +33,18 @@ jobs:
         with:
           tune-api: true
           training-operator: true
+      
+      - name: Check the status of Experiment and Trials
+        shell: bash
+        run: |
+          kubectl get pods -n default
+          
+          # describe pod
+          pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}')
+          kubectl describe pod $pod_name -n default
+          
+          # check the logs of pod
+          kubectl logs $pod_name -n default
 
     strategy:
       fail-fast: false

From 1df7df953ba74fd1f0890f0a6074c5cd2039ee47 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 5 Sep 2024 18:06:38 +0800
Subject: [PATCH 13/57] check reason for ImagePullBackOff

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      | 12 -------
 .../workflows/template-e2e-test/action.yaml   | 17 +++++++++
 .../scripts/gh-actions/run-e2e-tune-api.py    | 35 +++----------------
 3 files changed, 21 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 31d3585cff2..e72e6f6ef9b 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -33,18 +33,6 @@ jobs:
         with:
           tune-api: true
           training-operator: true
-      
-      - name: Check the status of Experiment and Trials
-        shell: bash
-        run: |
-          kubectl get pods -n default
-          
-          # describe pod
-          pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}')
-          kubectl describe pod $pod_name -n default
-          
-          # check the logs of pod
-          kubectl logs $pod_name -n default
 
     strategy:
       fail-fast: false
diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml
index 7c9598df04b..ef91c647cea 100644
--- a/.github/workflows/template-e2e-test/action.yaml
+++ b/.github/workflows/template-e2e-test/action.yaml
@@ -47,3 +47,20 @@ runs:
         else
           ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
         fi
+    
+    - name: Check disk space
+      shell: bash
+      run: |
+        df -hT
+    
+    - name: Check the status of Experiment and Trials
+      shell: bash
+      run: |
+        kubectl get pods -n default
+        
+        # describe pod
+        pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}')
+        kubectl describe pod $pod_name -n default
+        
+        # check the logs of pod
+        kubectl logs $pod_name -n default -c metrics-logger-and-collector
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 48496b864c6..d18c4b66c4e 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -9,7 +9,7 @@
     HuggingFaceModelParams,
     HuggingFaceTrainerParams,
 )
-from kubernetes import client, config
+from kubernetes import client
 from peft import LoraConfig
 from verify import verify_experiment_results
 
@@ -19,25 +19,6 @@
 # The default logging config.
 logging.basicConfig(level=logging.INFO)
 
-# Function to get logs of the pod related to the experiment.
-def get_experiment_pod_logs(namespace: str, exp_name: str):
-    v1 = client.CoreV1Api()
-    pods = v1.list_namespaced_pod(namespace)
-    
-    for pod in pods.items:
-        # Identify the pod associated with the experiment
-        if exp_name in pod.metadata.name:
-            logging.info(f"Describing pod: {pod.metadata.name}")
-            pod_description = v1.read_namespaced_pod(name=pod.metadata.name, namespace=namespace)
-            logging.info(pod_description)
-            
-            logging.info(f"Fetching logs for pod: {pod.metadata.name}")
-            pod_logs = v1.read_namespaced_pod_log(name=pod.metadata.name, namespace=namespace)
-            logging.info(pod_logs)
-            break
-    else:
-        logging.warning(f"No pod found for experiment: {exp_name}")
-
 
 # Test for Experiment created with custom objective.
 def run_e2e_experiment_create_by_tune_with_custom_objective(
@@ -163,8 +144,6 @@ def run_e2e_experiment_create_by_tune_with_external_model(
     if args.verbose:
         logging.getLogger().setLevel(logging.DEBUG)
 
-    config.load_kube_config()  # Load Kubernetes config from the environment
-
     katib_client = KatibClient()
 
     namespace_labels = client.CoreV1Api().read_namespace(args.namespace).metadata.labels
@@ -184,9 +163,6 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}")
         raise e
     finally:
-        # Describe and get logs of the experiment pod
-        get_experiment_pod_logs(exp_namespace, exp_name)
-
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
@@ -201,11 +177,8 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_2}")
         raise e
-    finally:
-        # Describe and get logs of the experiment pod
-        get_experiment_pod_logs(exp_namespace, exp_name_2)
-
+    #finally:
         # Delete the Experiment.
-        logging.info("---------------------------------------------------------------")
-        logging.info("---------------------------------------------------------------")
+        #logging.info("---------------------------------------------------------------")
+        #logging.info("---------------------------------------------------------------")
         #katib_client.delete_experiment(exp_name_2, exp_namespace)

From d1e1311bd2af48e3b198f5ed411e96169cf58f2c Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 5 Sep 2024 19:17:19 +0800
Subject: [PATCH 14/57] revert timeout limit

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .../workflows/template-e2e-test/action.yaml   | 18 +------------
 .../scripts/gh-actions/run-e2e-tune-api.py    | 27 +++++++++----------
 2 files changed, 14 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml
index ef91c647cea..f5ea534cd2e 100644
--- a/.github/workflows/template-e2e-test/action.yaml
+++ b/.github/workflows/template-e2e-test/action.yaml
@@ -47,20 +47,4 @@ runs:
         else
           ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
         fi
-    
-    - name: Check disk space
-      shell: bash
-      run: |
-        df -hT
-    
-    - name: Check the status of Experiment and Trials
-      shell: bash
-      run: |
-        kubectl get pods -n default
-        
-        # describe pod
-        pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}')
-        kubectl describe pod $pod_name -n default
-        
-        # check the logs of pod
-        kubectl logs $pod_name -n default -c metrics-logger-and-collector
+
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index d18c4b66c4e..a425a3ea105 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -14,7 +14,7 @@
 from verify import verify_experiment_results
 
 # Experiment timeout is 40 min.
-EXPERIMENT_TIMEOUT = 60 * 10
+EXPERIMENT_TIMEOUT = 60 * 40
 
 # The default logging config.
 logging.basicConfig(level=logging.INFO)
@@ -152,33 +152,32 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         client.CoreV1Api().patch_namespace(args.namespace, {'metadata': {'labels': namespace_labels}})
 
     # Test with run_e2e_experiment_create_by_tune
-    exp_name = "tune-example-1"
+    exp_name = "tune-example"
     exp_namespace = args.namespace
     try:
-        run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, exp_name, exp_namespace)
+        run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, f"{exp_name}-1", exp_namespace)
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}")
+        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{f"{exp_name}-1"}")
     except Exception as e:
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}")
+        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{f"{exp_name}-1"}")
         raise e
     finally:
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
-        katib_client.delete_experiment(exp_name, exp_namespace)
+        katib_client.delete_experiment(f"{exp_name}-1", exp_namespace)
     
-    exp_name_2 = "tune-example-2"
     try:
-        run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name_2, exp_namespace)
+        run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace)
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name_2}")
+        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{f"{exp_name}-2"}")
     except Exception as e:
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_2}")
+        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{f"{exp_name}-2"}")
         raise e
-    #finally:
+    finally:
         # Delete the Experiment.
-        #logging.info("---------------------------------------------------------------")
-        #logging.info("---------------------------------------------------------------")
-        #katib_client.delete_experiment(exp_name_2, exp_namespace)
+        logging.info("---------------------------------------------------------------")
+        logging.info("---------------------------------------------------------------")
+        katib_client.delete_experiment(f"{exp_name}-2", exp_namespace)

From 0cc319f7611593c32efbfc3eb603c4179d84b7bc Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 5 Sep 2024 19:32:18 +0800
Subject: [PATCH 15/57] fix format

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/template-e2e-test/action.yaml        |  1 -
 .../e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 10 +++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml
index f5ea534cd2e..7c9598df04b 100644
--- a/.github/workflows/template-e2e-test/action.yaml
+++ b/.github/workflows/template-e2e-test/action.yaml
@@ -47,4 +47,3 @@ runs:
         else
           ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
         fi
-
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index a425a3ea105..707c8a431ba 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -157,24 +157,24 @@ def run_e2e_experiment_create_by_tune_with_external_model(
     try:
         run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, f"{exp_name}-1", exp_namespace)
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{f"{exp_name}-1"}")
+        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-1")
     except Exception as e:
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{f"{exp_name}-1"}")
+        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-1")
         raise e
     finally:
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
         katib_client.delete_experiment(f"{exp_name}-1", exp_namespace)
-    
+
     try:
         run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace)
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{f"{exp_name}-2"}")
+        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-2")
     except Exception as e:
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{f"{exp_name}-2"}")
+        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
         raise e
     finally:
         # Delete the Experiment.

From 03839326e70418bd9a44e55ea77869fa9155632c Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 12 Sep 2024 18:00:12 -0600
Subject: [PATCH 16/57] extend timeout limit

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 707c8a431ba..e27d0d81d07 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -13,8 +13,8 @@
 from peft import LoraConfig
 from verify import verify_experiment_results
 
-# Experiment timeout is 40 min.
-EXPERIMENT_TIMEOUT = 60 * 40
+# Experiment timeout is 60 min.
+EXPERIMENT_TIMEOUT = 60 * 60
 
 # The default logging config.
 logging.basicConfig(level=logging.INFO)

From 08c86343d22ba5698140097a4e9f2cd80e71f86b Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 12 Sep 2024 18:01:51 -0600
Subject: [PATCH 17/57] update training operator sdk version

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index e72e6f6ef9b..c8a728ea391 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -25,8 +25,7 @@ jobs:
       - name: Install Training Operator SDK
         shell: bash
         run: |
-          pip install git+https://github.com/kubeflow/training-operator.git@v1.8-branch#subdirectory=sdk/python
-          pip install peft==0.3.0 datasets==2.15.0 transformers==4.38.0
+          pip install "kubeflow-training[huggingface]==1.8.1"
       
       - name: Run e2e test with tune API
         uses: ./.github/workflows/template-e2e-test

From 7a98a001a0e2e5467269f84992dede8214c269e8 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 12 Sep 2024 22:54:10 -0600
Subject: [PATCH 18/57] check the logs of pod

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      |  7 +++++
 .../scripts/gh-actions/run-e2e-tune-api.py    | 27 ++++++++++++++++---
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index c8a728ea391..9b9d9658410 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -33,6 +33,13 @@ jobs:
           tune-api: true
           training-operator: true
 
+      # Step to get logs of the relevant Experiment pod
+      - name: Fetch Experiment Pod Logs
+        run: |
+          POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2)
+          echo "Fetching logs for pod: $POD_NAME"
+          kubectl logs $POD_NAME -n default
+
     strategy:
       fail-fast: false
       matrix:
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index e27d0d81d07..e35df2aa204 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -14,12 +14,30 @@
 from verify import verify_experiment_results
 
 # Experiment timeout is 60 min.
-EXPERIMENT_TIMEOUT = 60 * 60
+EXPERIMENT_TIMEOUT = 60 * 15
 
 # The default logging config.
 logging.basicConfig(level=logging.INFO)
 
 
+def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_namespace: str):
+    # List all the pods in the namespace
+    v1 = client.CoreV1Api()
+    pods = v1.list_namespaced_pod(namespace=exp_namespace)
+    
+    # Filter pods related to the specific Katib Experiment
+    for pod in pods.items:
+        if exp_name in pod.metadata.name:
+            logging.info(f"Fetching logs for pod: {pod.metadata.name}")
+            try:
+                pod_logs = v1.read_namespaced_pod_log(
+                    name=pod.metadata.name, namespace=exp_namespace
+                )
+                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs}")
+            except Exception as e:
+                logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}")
+
+
 # Test for Experiment created with custom objective.
 def run_e2e_experiment_create_by_tune_with_custom_objective(
     katib_client: KatibClient,
@@ -117,7 +135,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         resources_per_trial=katib.TrainerResources(
             num_workers=1,
             num_procs_per_worker=1,
-            resources_per_worker={"cpu": "2", "memory": "10G",},
+            resources_per_worker={"cpu": "1", "memory": "10G",},
         ),
     )
     experiment = katib_client.wait_for_experiment_condition(
@@ -166,7 +184,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
-        katib_client.delete_experiment(f"{exp_name}-1", exp_namespace)
+        #katib_client.delete_experiment(f"{exp_name}-1", exp_namespace)
 
     try:
         run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace)
@@ -175,9 +193,10 @@ def run_e2e_experiment_create_by_tune_with_external_model(
     except Exception as e:
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
+        get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace)
         raise e
     finally:
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
-        katib_client.delete_experiment(f"{exp_name}-2", exp_namespace)
+        #katib_client.delete_experiment(f"{exp_name}-2", exp_namespace)

From 8862d7965c4b03db2cdb5f36f73a14eaf4af10a9 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 12 Sep 2024 22:57:06 -0600
Subject: [PATCH 19/57] rerun tests

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index e35df2aa204..3a7c4949cc7 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -14,7 +14,7 @@
 from verify import verify_experiment_results
 
 # Experiment timeout is 60 min.
-EXPERIMENT_TIMEOUT = 60 * 15
+EXPERIMENT_TIMEOUT = 60 * 10
 
 # The default logging config.
 logging.basicConfig(level=logging.INFO)

From e4f614dd1e140a6464a3193542a7c664b8c0783d Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Fri, 13 Sep 2024 22:13:47 -0600
Subject: [PATCH 20/57] update the function of getting logs

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 3a7c4949cc7..5dae31ade3e 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -30,10 +30,13 @@ def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_names
         if exp_name in pod.metadata.name:
             logging.info(f"Fetching logs for pod: {pod.metadata.name}")
             try:
+                # Specify the container name when retrieving logs
                 pod_logs = v1.read_namespaced_pod_log(
-                    name=pod.metadata.name, namespace=exp_namespace
+                    name=pod.metadata.name,
+                    namespace=exp_namespace,
+                    container="metrics-logger-and-collector"  # Specify the desired container
                 )
-                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs}")
+                logging.info(f"Logs for pod {pod.metadata.name} (container: metrics-logger-and-collector):\n{pod_logs}")
             except Exception as e:
                 logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}")
 

From 0385eeaef053ce7f27cca0adb1146b67c93f6e69 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Fri, 13 Sep 2024 22:48:27 -0600
Subject: [PATCH 21/57] add the step of describing pod

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 9b9d9658410..612463bbbfc 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -35,9 +35,11 @@ jobs:
 
       # Step to get logs of the relevant Experiment pod
       - name: Fetch Experiment Pod Logs
+        if: always()  # Run this step even if previous steps fail
         run: |
           POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2)
           echo "Fetching logs for pod: $POD_NAME"
+          kubectl describe pod $POD_NAME -n default
           kubectl logs $POD_NAME -n default
 
     strategy:

From e0c51704111109314e5242d57050a542a77a5d8b Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Fri, 13 Sep 2024 23:28:49 -0600
Subject: [PATCH 22/57] check disk space

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 612463bbbfc..6d10e3569ff 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -27,11 +27,24 @@ jobs:
         run: |
           pip install "kubeflow-training[huggingface]==1.8.1"
       
+      # Step to check disk space
+      - name: Check Disk Space
+        run: |
+          echo "Checking disk space usage before e2e test..."
+          df -h  # Run 'df' to check free disk space
+      
       - name: Run e2e test with tune API
         uses: ./.github/workflows/template-e2e-test
         with:
           tune-api: true
           training-operator: true
+      
+      # Step to check disk space
+      - name: Check Disk Space
+        if: always()  # Run this step even if previous steps fail
+        run: |
+          echo "Checking disk space usage after e2e test..."
+          df -h  # Run 'df' to check free disk space
 
       # Step to get logs of the relevant Experiment pod
       - name: Fetch Experiment Pod Logs

From 0286f7077a52fb4bd106d2dc2e71021d3ae04c56 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Mon, 16 Sep 2024 20:19:58 -0600
Subject: [PATCH 23/57] change work directory

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 6d10e3569ff..0748c681fbb 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -13,6 +13,8 @@ jobs:
   e2e:
     runs-on: ubuntu-22.04
     timeout-minutes: 120
+    env:
+      GITHUB_WORKSPACE: /mnt/docker
     steps:
       - name: Checkout
         uses: actions/checkout@v4

From f6e5ed569d86047d0cace7cec709831cbaa6f4e2 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Mon, 16 Sep 2024 21:02:09 -0600
Subject: [PATCH 24/57] change work directory

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 0748c681fbb..329b2ae6173 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -13,11 +13,27 @@ jobs:
   e2e:
     runs-on: ubuntu-22.04
     timeout-minutes: 120
-    env:
-      GITHUB_WORKSPACE: /mnt/docker
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+      
+      - name: Move docker data directory
+        shell: bash
+        run: |
+          echo "Stopping docker service ..."
+          sudo systemctl stop docker
+          DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
+          DOCKER_ROOT_DIR=/mnt/docker
+          echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
+          sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
+          echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
+          sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
+          echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
+          echo "Starting docker service ..."
+          sudo systemctl daemon-reload
+          sudo systemctl start docker
+          echo "Docker service status:"
+          sudo systemctl --no-pager -l -o short status docker
 
       - name: Setup Test Env
         uses: ./.github/workflows/template-setup-e2e-test

From 7ea7e43b17fc1b2d4e6654bcd835c6485df5ca58 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Mon, 16 Sep 2024 21:36:56 -0600
Subject: [PATCH 25/57] increase timeout limit

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 5dae31ade3e..1a5f9eed99d 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -13,8 +13,8 @@
 from peft import LoraConfig
 from verify import verify_experiment_results
 
-# Experiment timeout is 60 min.
-EXPERIMENT_TIMEOUT = 60 * 10
+# Experiment timeout is 40 min.
+EXPERIMENT_TIMEOUT = 60 * 40
 
 # The default logging config.
 logging.basicConfig(level=logging.INFO)

From 25d99b198fbbc27b580039c03410906f91ba009f Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Mon, 16 Sep 2024 22:46:52 -0600
Subject: [PATCH 26/57] check the logs of controller and events

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml                | 2 ++
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 329b2ae6173..68426f23bed 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -68,10 +68,12 @@ jobs:
       - name: Fetch Experiment Pod Logs
         if: always()  # Run this step even if previous steps fail
         run: |
+          kubectl get pods -n default
           POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2)
           echo "Fetching logs for pod: $POD_NAME"
           kubectl describe pod $POD_NAME -n default
           kubectl logs $POD_NAME -n default
+          kubectl get events -n default | grep "tune-example-2"
 
     strategy:
       fail-fast: false
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 1a5f9eed99d..9e327ac6adf 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -14,7 +14,7 @@
 from verify import verify_experiment_results
 
 # Experiment timeout is 40 min.
-EXPERIMENT_TIMEOUT = 60 * 40
+EXPERIMENT_TIMEOUT = 60 * 15
 
 # The default logging config.
 logging.basicConfig(level=logging.INFO)
@@ -197,6 +197,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
         get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace)
+        get_experiment_pods_logs(katib_client, "katib-controller", "kubeflow")
         raise e
     finally:
         # Delete the Experiment.

From fcd64faad5bbd38a5849e4bde44ed8f927c1b85d Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 17 Sep 2024 19:32:28 -0700
Subject: [PATCH 27/57] change work directory

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      | 18 ----------------
 .../workflows/template-e2e-test/action.yaml   |  1 +
 .../scripts/gh-actions/run-e2e-tune-api.py    | 21 ++++++++++++++++++-
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 68426f23bed..909d0022ce5 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -16,24 +16,6 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
-      
-      - name: Move docker data directory
-        shell: bash
-        run: |
-          echo "Stopping docker service ..."
-          sudo systemctl stop docker
-          DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
-          DOCKER_ROOT_DIR=/mnt/docker
-          echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
-          sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
-          echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
-          sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
-          echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
-          echo "Starting docker service ..."
-          sudo systemctl daemon-reload
-          sudo systemctl start docker
-          echo "Docker service status:"
-          sudo systemctl --no-pager -l -o short status docker
 
       - name: Setup Test Env
         uses: ./.github/workflows/template-setup-e2e-test
diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml
index 7c9598df04b..c4a8c8831e4 100644
--- a/.github/workflows/template-e2e-test/action.yaml
+++ b/.github/workflows/template-e2e-test/action.yaml
@@ -47,3 +47,4 @@ runs:
         else
           ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
         fi
+      working-directory: /mnt/docker
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 9e327ac6adf..39c6d683488 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -36,6 +36,25 @@ def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_names
                     namespace=exp_namespace,
                     container="metrics-logger-and-collector"  # Specify the desired container
                 )
+                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs}")
+            except Exception as e:
+                logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}")
+
+def get_experiment_pods_logs_2(katib_client: KatibClient, exp_name: str, exp_namespace: str):
+    # List all the pods in the namespace
+    v1 = client.CoreV1Api()
+    pods = v1.list_namespaced_pod(namespace=exp_namespace)
+    
+    # Filter pods related to the specific Katib Experiment
+    for pod in pods.items:
+        if exp_name in pod.metadata.name:
+            logging.info(f"Fetching logs for pod: {pod.metadata.name}")
+            try:
+                # Specify the container name when retrieving logs
+                pod_logs = v1.read_namespaced_pod_log(
+                    name=pod.metadata.name,
+                    namespace=exp_namespace,
+                )
                 logging.info(f"Logs for pod {pod.metadata.name} (container: metrics-logger-and-collector):\n{pod_logs}")
             except Exception as e:
                 logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}")
@@ -197,7 +216,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
         get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace)
-        get_experiment_pods_logs(katib_client, "katib-controller", "kubeflow")
+        get_experiment_pods_logs_2(katib_client, "katib-controller", "kubeflow")
         raise e
     finally:
         # Delete the Experiment.

From 122c6115d2ed8c884adb4ef5b59b552f4ad029db Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 17 Sep 2024 19:43:19 -0700
Subject: [PATCH 28/57] change work directory

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/template-e2e-test/action.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml
index c4a8c8831e4..dd58cb12e46 100644
--- a/.github/workflows/template-e2e-test/action.yaml
+++ b/.github/workflows/template-e2e-test/action.yaml
@@ -47,4 +47,4 @@ runs:
         else
           ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
         fi
-      working-directory: /mnt/docker
+      working-directory: /mnt

From c1fde099fc0f4f8753d2d5fa57824f3846f962ee Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 17 Sep 2024 20:31:25 -0700
Subject: [PATCH 29/57] change work directory

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .../workflows/template-e2e-test/action.yaml   |  1 -
 .../template-setup-e2e-test/action.yaml       | 25 +++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml
index dd58cb12e46..7c9598df04b 100644
--- a/.github/workflows/template-e2e-test/action.yaml
+++ b/.github/workflows/template-e2e-test/action.yaml
@@ -47,4 +47,3 @@ runs:
         else
           ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
         fi
-      working-directory: /mnt
diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml
index 75ee040aea2..657113afc4d 100644
--- a/.github/workflows/template-setup-e2e-test/action.yaml
+++ b/.github/workflows/template-setup-e2e-test/action.yaml
@@ -30,6 +30,31 @@ runs:
 
         echo "Disk usage after cleanup:"
         df -h
+    
+    - name: Prune docker images
+      shell: bash
+      run: |
+        docker image prune -a -f
+        docker system df
+        df -hT
+  
+    - name: Move docker data directory
+      shell: bash
+      run: |
+        echo "Stopping docker service ..."
+        sudo systemctl stop docker
+        DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
+        DOCKER_ROOT_DIR=/mnt/docker
+        echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
+        sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
+        echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
+        sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
+        echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
+        echo "Starting docker service ..."
+        sudo systemctl daemon-reload
+        sudo systemctl start docker
+        echo "Docker service status:"
+        sudo systemctl --no-pager -l -o short status docker  
 
     - name: Setup kubectl
       uses: azure/setup-kubectl@v4

From 8ff6864ae9f7b6f2de4abb715ccabbe11bd653a5 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 17 Sep 2024 21:41:16 -0700
Subject: [PATCH 30/57] check the logs of kubelet

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 909d0022ce5..f374c72c291 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -56,6 +56,14 @@ jobs:
           kubectl describe pod $POD_NAME -n default
           kubectl logs $POD_NAME -n default
           kubectl get events -n default | grep "tune-example-2"
+        
+      # Step to fetch kubelet logs from Minikube
+      - name: Fetch Kubelet Logs
+        if: always()  # Run this step even if previous steps fail
+        shell: bash
+        run: |
+          echo "Fetching kubelet logs from Minikube..."
+          minikube ssh "sudo journalctl -u kubelet"
 
     strategy:
       fail-fast: false

From da3c298c1ae4d3cac398bbab5f4ab56974d93f6e Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 17 Sep 2024 22:11:05 -0700
Subject: [PATCH 31/57] check the logs of kubelet

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml                | 4 ++--
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index f374c72c291..3f7219e1077 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -62,8 +62,8 @@ jobs:
         if: always()  # Run this step even if previous steps fail
         shell: bash
         run: |
-          echo "Fetching kubelet logs from Minikube..."
-          minikube ssh "sudo journalctl -u kubelet"
+          echo "Fetching kubelet logs..."
+          sudo journalctl -u kubelet
 
     strategy:
       fail-fast: false
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 39c6d683488..f83eb639cbd 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -14,7 +14,7 @@
 from verify import verify_experiment_results
 
 # Experiment timeout is 40 min.
-EXPERIMENT_TIMEOUT = 60 * 15
+EXPERIMENT_TIMEOUT = 60 * 10
 
 # The default logging config.
 logging.basicConfig(level=logging.INFO)

From a1bff26b6ba3999c15c50e77eba096aacd4fc41d Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Wed, 18 Sep 2024 17:49:44 -0700
Subject: [PATCH 32/57] increase cpu

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml                | 1 +
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 3f7219e1077..d657928b07e 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -55,6 +55,7 @@ jobs:
           echo "Fetching logs for pod: $POD_NAME"
           kubectl describe pod $POD_NAME -n default
           kubectl logs $POD_NAME -n default
+          kubectl top pods $POD_NAME
           kubectl get events -n default | grep "tune-example-2"
         
       # Step to fetch kubelet logs from Minikube
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index f83eb639cbd..eeac5d10d7f 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -157,7 +157,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         resources_per_trial=katib.TrainerResources(
             num_workers=1,
             num_procs_per_worker=1,
-            resources_per_worker={"cpu": "1", "memory": "10G",},
+            resources_per_worker={"cpu": "2", "memory": "10G",},
         ),
     )
     experiment = katib_client.wait_for_experiment_condition(

From bbae57bb3c217756f61f5220d5be2ba873455e5c Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Wed, 18 Sep 2024 18:27:46 -0700
Subject: [PATCH 33/57] check the logs of training operator

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml                | 2 +-
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index d657928b07e..e80f3067fd6 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -51,7 +51,7 @@ jobs:
         if: always()  # Run this step even if previous steps fail
         run: |
           kubectl get pods -n default
-          POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2)
+          POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master)
           echo "Fetching logs for pod: $POD_NAME"
           kubectl describe pod $POD_NAME -n default
           kubectl logs $POD_NAME -n default
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index eeac5d10d7f..4168a8e3786 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -217,6 +217,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
         get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace)
         get_experiment_pods_logs_2(katib_client, "katib-controller", "kubeflow")
+        get_experiment_pods_logs_2(katib_client, "training-operator", "kubeflow")
         raise e
     finally:
         # Delete the Experiment.

From e45ceac4745e1317feab569256b75d5e2305bffb Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Wed, 18 Sep 2024 19:22:12 -0700
Subject: [PATCH 34/57] check the use of resources

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index e80f3067fd6..32cf0a12add 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -54,7 +54,6 @@ jobs:
           POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master)
           echo "Fetching logs for pod: $POD_NAME"
           kubectl describe pod $POD_NAME -n default
-          kubectl logs $POD_NAME -n default
           kubectl top pods $POD_NAME
           kubectl get events -n default | grep "tune-example-2"
         

From 4ae11edbe725c52005587091b39e3f84816641fb Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 19 Sep 2024 20:47:11 -0700
Subject: [PATCH 35/57] check the logs of container 'pytorch' and
 'storage-initializer'

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml         |  1 -
 .../scripts/gh-actions/run-e2e-tune-api.py       | 16 ++++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 32cf0a12add..7b41130f499 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -52,7 +52,6 @@ jobs:
         run: |
           kubectl get pods -n default
           POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master)
-          echo "Fetching logs for pod: $POD_NAME"
           kubectl describe pod $POD_NAME -n default
           kubectl top pods $POD_NAME
           kubectl get events -n default | grep "tune-example-2"
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 4168a8e3786..e5eb39c0d4a 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -31,12 +31,24 @@ def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_names
             logging.info(f"Fetching logs for pod: {pod.metadata.name}")
             try:
                 # Specify the container name when retrieving logs
-                pod_logs = v1.read_namespaced_pod_log(
+                pod_logs1 = v1.read_namespaced_pod_log(
                     name=pod.metadata.name,
                     namespace=exp_namespace,
                     container="metrics-logger-and-collector"  # Specify the desired container
                 )
-                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs}")
+                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs1}")
+                pod_logs2 = v1.read_namespaced_pod_log(
+                    name=pod.metadata.name,
+                    namespace=exp_namespace,
+                    container="pytorch"
+                )
+                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs2}")
+                pod_logs3 = v1.read_namespaced_pod_log(
+                    name=pod.metadata.name,
+                    namespace=exp_namespace,
+                    container="storage-initializer"
+                )
+                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs3}")
             except Exception as e:
                 logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}")
 

From bedab365208cd07d45c7665c9ffe705ce6419bd3 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 19 Sep 2024 21:44:12 -0700
Subject: [PATCH 36/57] fix error of checking use of resources

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml            |  5 ++++-
 .../v1beta1/scripts/gh-actions/run-e2e-tune-api.py  | 13 +++++++++----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 7b41130f499..12a890d58d7 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -53,8 +53,11 @@ jobs:
           kubectl get pods -n default
           POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master)
           kubectl describe pod $POD_NAME -n default
-          kubectl top pods $POD_NAME
           kubectl get events -n default | grep "tune-example-2"
+          kubectl get apiservices | grep metrics
+          minikube addons enable metrics-server
+          kubectl get pods -n kube-system
+          kubectl top pods $POD_NAME
         
       # Step to fetch kubelet logs from Minikube
       - name: Fetch Kubelet Logs
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index e5eb39c0d4a..6a125c3033a 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -34,21 +34,26 @@ def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_names
                 pod_logs1 = v1.read_namespaced_pod_log(
                     name=pod.metadata.name,
                     namespace=exp_namespace,
-                    container="metrics-logger-and-collector"  # Specify the desired container
+                    container="metrics-logger-and-collector"
                 )
-                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs1}")
+                logging.info(f"Logs of metrics-logger-and-collector for pod {pod.metadata.name}:\n{pod_logs1}")
                 pod_logs2 = v1.read_namespaced_pod_log(
                     name=pod.metadata.name,
                     namespace=exp_namespace,
                     container="pytorch"
                 )
-                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs2}")
+                logging.info(f"Logs of pytorch for pod {pod.metadata.name}:\n{pod_logs2}")
                 pod_logs3 = v1.read_namespaced_pod_log(
                     name=pod.metadata.name,
                     namespace=exp_namespace,
                     container="storage-initializer"
                 )
-                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs3}")
+                logging.info(f"Logs of storage-initializer for pod {pod.metadata.name}:\n{pod_logs3}")
+                pod_logs4 = v1.read_namespaced_pod_log(
+                    name=pod.metadata.name,
+                    namespace=exp_namespace,
+                )
+                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs4}")
             except Exception as e:
                 logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}")
 

From 7bfb3cc2df5b6d8b3ac805d70664d48c15dbeffa Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 19 Sep 2024 22:41:42 -0700
Subject: [PATCH 37/57] add other checks to find the error reason

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 60 +++++++++++++++++++-----
 1 file changed, 47 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 12a890d58d7..12aac4d2b36 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -27,48 +27,82 @@ jobs:
         run: |
           pip install "kubeflow-training[huggingface]==1.8.1"
       
-      # Step to check disk space
-      - name: Check Disk Space
+      # Step 2: Check Disk Space Before Test
+      - name: Check Disk Space Before Test
         run: |
           echo "Checking disk space usage before e2e test..."
           df -h  # Run 'df' to check free disk space
       
+      # Step 3: Run e2e test with tune API
       - name: Run e2e test with tune API
         uses: ./.github/workflows/template-e2e-test
         with:
           tune-api: true
           training-operator: true
       
-      # Step to check disk space
-      - name: Check Disk Space
+      # Step 4: Check Disk Space After Test
+      - name: Check Disk Space After Test
         if: always()  # Run this step even if previous steps fail
         run: |
           echo "Checking disk space usage after e2e test..."
           df -h  # Run 'df' to check free disk space
 
-      # Step to get logs of the relevant Experiment pod
+      # Step 5: Fetch Pod Logs for Relevant Experiment Pod
       - name: Fetch Experiment Pod Logs
         if: always()  # Run this step even if previous steps fail
         run: |
+          echo "Fetching logs for experiment pod..."
           kubectl get pods -n default
           POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master)
           kubectl describe pod $POD_NAME -n default
           kubectl get events -n default | grep "tune-example-2"
-          kubectl get apiservices | grep metrics
-          minikube addons enable metrics-server
-          kubectl get pods -n kube-system
-          kubectl top pods $POD_NAME
-        
-      # Step to fetch kubelet logs from Minikube
+      
+      # Step 6: Fetch kubelet logs (requires sudo for accessing kubelet logs)
       - name: Fetch Kubelet Logs
         if: always()  # Run this step even if previous steps fail
-        shell: bash
         run: |
           echo "Fetching kubelet logs..."
           sudo journalctl -u kubelet
+      
+      # Step 7: Check Node Resource Usage
+      - name: Check Node Resource Usage
+        if: always()
+        run: |
+          echo "Checking node resource usage..."
+          NODE_NAME=$(kubectl get pods -n default -o jsonpath="{.items[0].spec.nodeName}")
+          kubectl top node $NODE_NAME
+
+      # Step 8: Check Pod Resource Usage
+      - name: Check Pod Resource Usage
+        if: always()
+        run: |
+          echo "Checking pod resource usage..."
+          kubectl top pod -n default $POD_NAME
+      
+      # Step 9: Fetch Network Information for Pod
+      - name: Fetch Network Info
+        if: always()
+        run: |
+          echo "Fetching network info for pod $POD_NAME"
+          kubectl exec $POD_NAME -n default -- ip a
+
+      # Step 10: Check Docker Logs for Container
+      - name: Check Docker Logs for Container
+        if: always()
+        run: |
+          echo "Fetching Docker logs..."
+          CONTAINER_ID=$(docker ps | grep $POD_NAME | awk '{print $1}')
+          docker logs $CONTAINER_ID
+
+      # Step 11: Check Kernel Logs for OOM/Resource Issues
+      - name: Check Kernel Logs for Resource Issues
+        if: always()
+        run: |
+          echo "Checking kernel logs for resource issues..."
+          dmesg | grep -i "oom\|kill"
 
     strategy:
       fail-fast: false
       matrix:
-        # Detail: https://hub.docker.com/r/kindest/node
+        # Kubernetes versions to test with
         kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]

From efffdc25abb46f9283cafe6ae72bd94d9b1dc34e Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Fri, 20 Sep 2024 23:47:19 -0700
Subject: [PATCH 38/57] set 'storage_config'

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 6a125c3033a..5338b016101 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -176,6 +176,10 @@ def run_e2e_experiment_create_by_tune_with_external_model(
             num_procs_per_worker=1,
             resources_per_worker={"cpu": "2", "memory": "10G",},
         ),
+        storage_config={
+            "size": "10Gi",
+            "access_modes": ["ReadWriteOnce"],
+        },
     )
     experiment = katib_client.wait_for_experiment_condition(
         exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT

From 2a18b17b0fc4236ec5cb83404e2fa0323749f4e9 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Sat, 21 Sep 2024 20:49:34 -0700
Subject: [PATCH 39/57] reduce the number of tests

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 39 +-----------------------
 1 file changed, 1 insertion(+), 38 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 12aac4d2b36..d59679e0173 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -63,46 +63,9 @@ jobs:
         run: |
           echo "Fetching kubelet logs..."
           sudo journalctl -u kubelet
-      
-      # Step 7: Check Node Resource Usage
-      - name: Check Node Resource Usage
-        if: always()
-        run: |
-          echo "Checking node resource usage..."
-          NODE_NAME=$(kubectl get pods -n default -o jsonpath="{.items[0].spec.nodeName}")
-          kubectl top node $NODE_NAME
-
-      # Step 8: Check Pod Resource Usage
-      - name: Check Pod Resource Usage
-        if: always()
-        run: |
-          echo "Checking pod resource usage..."
-          kubectl top pod -n default $POD_NAME
-      
-      # Step 9: Fetch Network Information for Pod
-      - name: Fetch Network Info
-        if: always()
-        run: |
-          echo "Fetching network info for pod $POD_NAME"
-          kubectl exec $POD_NAME -n default -- ip a
-
-      # Step 10: Check Docker Logs for Container
-      - name: Check Docker Logs for Container
-        if: always()
-        run: |
-          echo "Fetching Docker logs..."
-          CONTAINER_ID=$(docker ps | grep $POD_NAME | awk '{print $1}')
-          docker logs $CONTAINER_ID
-
-      # Step 11: Check Kernel Logs for OOM/Resource Issues
-      - name: Check Kernel Logs for Resource Issues
-        if: always()
-        run: |
-          echo "Checking kernel logs for resource issues..."
-          dmesg | grep -i "oom\|kill"
 
     strategy:
       fail-fast: false
       matrix:
         # Kubernetes versions to test with
-        kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
+        kubernetes-version: ["v1.29.2"]

From c6c964bd95372f2a9c0588f3efb03d0460e9225d Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Sat, 21 Sep 2024 21:29:32 -0700
Subject: [PATCH 40/57] Check container runtime logs

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index d59679e0173..115d7dc10ab 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -64,6 +64,12 @@ jobs:
           echo "Fetching kubelet logs..."
           sudo journalctl -u kubelet
 
+      - name: Check container runtime logs
+        if: always()  # Run this step even if previous steps fail
+        run: |
+          echo "Checking container runtime logs..."
+          sudo journalctl -u docker
+
     strategy:
       fail-fast: false
       matrix:

From 28ffb96ae59c3bf60289b4fa9155ddbe628bb12e Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Sat, 21 Sep 2024 22:48:08 -0700
Subject: [PATCH 41/57] set the driver of minikube as docker

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/template-setup-e2e-test/action.yaml | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml
index 657113afc4d..69665858514 100644
--- a/.github/workflows/template-setup-e2e-test/action.yaml
+++ b/.github/workflows/template-setup-e2e-test/action.yaml
@@ -31,13 +31,6 @@ runs:
         echo "Disk usage after cleanup:"
         df -h
     
-    - name: Prune docker images
-      shell: bash
-      run: |
-        docker image prune -a -f
-        docker system df
-        df -hT
-  
     - name: Move docker data directory
       shell: bash
       run: |
@@ -66,7 +59,7 @@ runs:
       with:
         network-plugin: cni
         cni: flannel
-        driver: none
+        driver: docker
         kubernetes-version: ${{ inputs.kubernetes-version }}
         minikube-version: 1.31.1
         start-args: --wait-timeout=120s

From dc684e30d0a32d9364cc8aafcb97bf6f85ffbfaa Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Sat, 21 Sep 2024 23:19:15 -0700
Subject: [PATCH 42/57] set the driver of minikube to none

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/template-setup-e2e-test/action.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml
index 69665858514..93ac2e3fd4d 100644
--- a/.github/workflows/template-setup-e2e-test/action.yaml
+++ b/.github/workflows/template-setup-e2e-test/action.yaml
@@ -59,7 +59,7 @@ runs:
       with:
         network-plugin: cni
         cni: flannel
-        driver: docker
+        driver: none
         kubernetes-version: ${{ inputs.kubernetes-version }}
         minikube-version: 1.31.1
         start-args: --wait-timeout=120s

From a12034c749c322e1696f4de6c00aa8fd66bf00c4 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Tue, 24 Sep 2024 13:26:37 -0700
Subject: [PATCH 43/57] check logs of pod

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      |  6 +-
 .../scripts/gh-actions/run-e2e-tune-api.py    | 62 +------------------
 2 files changed, 6 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 115d7dc10ab..01245fab0b5 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -51,10 +51,14 @@ jobs:
       - name: Fetch Experiment Pod Logs
         if: always()  # Run this step even if previous steps fail
         run: |
-          echo "Fetching logs for experiment pod..."
+          echo "Fetching all the pods in the default namespace..."
           kubectl get pods -n default
           POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master)
+          echo "Fetching pod description for experiment pod..."
           kubectl describe pod $POD_NAME -n default
+          echo "Fetching logs for experiment pod..."
+          kubectl logs $POD_NAME -n default --all-containers
+          echo "Fetching events for experiment pod..."
           kubectl get events -n default | grep "tune-example-2"
       
       # Step 6: Fetch kubelet logs (requires sudo for accessing kubelet logs)
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 5338b016101..b13c19494fb 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -19,64 +19,6 @@
 # The default logging config.
 logging.basicConfig(level=logging.INFO)
 
-
-def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_namespace: str):
-    # List all the pods in the namespace
-    v1 = client.CoreV1Api()
-    pods = v1.list_namespaced_pod(namespace=exp_namespace)
-    
-    # Filter pods related to the specific Katib Experiment
-    for pod in pods.items:
-        if exp_name in pod.metadata.name:
-            logging.info(f"Fetching logs for pod: {pod.metadata.name}")
-            try:
-                # Specify the container name when retrieving logs
-                pod_logs1 = v1.read_namespaced_pod_log(
-                    name=pod.metadata.name,
-                    namespace=exp_namespace,
-                    container="metrics-logger-and-collector"
-                )
-                logging.info(f"Logs of metrics-logger-and-collector for pod {pod.metadata.name}:\n{pod_logs1}")
-                pod_logs2 = v1.read_namespaced_pod_log(
-                    name=pod.metadata.name,
-                    namespace=exp_namespace,
-                    container="pytorch"
-                )
-                logging.info(f"Logs of pytorch for pod {pod.metadata.name}:\n{pod_logs2}")
-                pod_logs3 = v1.read_namespaced_pod_log(
-                    name=pod.metadata.name,
-                    namespace=exp_namespace,
-                    container="storage-initializer"
-                )
-                logging.info(f"Logs of storage-initializer for pod {pod.metadata.name}:\n{pod_logs3}")
-                pod_logs4 = v1.read_namespaced_pod_log(
-                    name=pod.metadata.name,
-                    namespace=exp_namespace,
-                )
-                logging.info(f"Logs for pod {pod.metadata.name}:\n{pod_logs4}")
-            except Exception as e:
-                logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}")
-
-def get_experiment_pods_logs_2(katib_client: KatibClient, exp_name: str, exp_namespace: str):
-    # List all the pods in the namespace
-    v1 = client.CoreV1Api()
-    pods = v1.list_namespaced_pod(namespace=exp_namespace)
-    
-    # Filter pods related to the specific Katib Experiment
-    for pod in pods.items:
-        if exp_name in pod.metadata.name:
-            logging.info(f"Fetching logs for pod: {pod.metadata.name}")
-            try:
-                # Specify the container name when retrieving logs
-                pod_logs = v1.read_namespaced_pod_log(
-                    name=pod.metadata.name,
-                    namespace=exp_namespace,
-                )
-                logging.info(f"Logs for pod {pod.metadata.name} (container: metrics-logger-and-collector):\n{pod_logs}")
-            except Exception as e:
-                logging.error(f"Failed to get logs for pod {pod.metadata.name}: {str(e)}")
-
-
 # Test for Experiment created with custom objective.
 def run_e2e_experiment_create_by_tune_with_custom_objective(
     katib_client: KatibClient,
@@ -180,6 +122,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
             "size": "10Gi",
             "access_modes": ["ReadWriteOnce"],
         },
+        retain_trials=True,
     )
     experiment = katib_client.wait_for_experiment_condition(
         exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT
@@ -236,9 +179,6 @@ def run_e2e_experiment_create_by_tune_with_external_model(
     except Exception as e:
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
-        get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace)
-        get_experiment_pods_logs_2(katib_client, "katib-controller", "kubeflow")
-        get_experiment_pods_logs_2(katib_client, "training-operator", "kubeflow")
         raise e
     finally:
         # Delete the Experiment.

From b0888155794af54749b4cca94c67d1fb7f8b6d81 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Sun, 29 Sep 2024 10:50:20 -0700
Subject: [PATCH 44/57] check memory usage

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 01245fab0b5..7f3ceea9884 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -33,13 +33,32 @@ jobs:
           echo "Checking disk space usage before e2e test..."
           df -h  # Run 'df' to check free disk space
       
+      - name: Monitor Memory Usage Before Run
+        if: always()
+        run: free -h
+      
+      - name: Monitor Docker Container Memory Usage
+        if: always()  
+        run: |
+          docker stats --no-stream
+      
       # Step 3: Run e2e test with tune API
       - name: Run e2e test with tune API
+        if: always()
         uses: ./.github/workflows/template-e2e-test
         with:
           tune-api: true
           training-operator: true
       
+      - name: Monitor Memory Usage After Run
+        if: always()
+        run: free -h
+      
+      - name: Monitor Docker Container Memory Usage
+        if: always()  
+        run: |
+          docker stats --no-stream
+        
       # Step 4: Check Disk Space After Test
       - name: Check Disk Space After Test
         if: always()  # Run this step even if previous steps fail

From e468b27a978ed94fd7396ead4037b76a9482d301 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Sun, 29 Sep 2024 11:59:38 -0700
Subject: [PATCH 45/57] increase 'termination_grace_period_seconds' in podspec

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      |  2 ++
 .../kubeflow/katib/api/katib_client.py        | 26 ++++++++++++++-----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 7f3ceea9884..b1c2c64dba0 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -30,6 +30,8 @@ jobs:
       # Step 2: Check Disk Space Before Test
       - name: Check Disk Space Before Test
         run: |
+          docker system prune -a
+          docker volume prune
           echo "Checking disk space usage before e2e test..."
           df -h  # Run 'df' to check free disk space
       
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
index 49c5d88e584..5db30c13387 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -656,15 +656,27 @@ class name in this argument.
                 ),
             )
 
-            worker_pod_template_spec = training_utils.get_pod_template_spec(
-                containers=[container_spec],
-                volumes=[storage_initializer_volume],
+            worker_pod_template_spec = models.V1PodTemplateSpec(
+                metadata=models.V1ObjectMeta(
+                    annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}
+                ),
+                spec=models.V1PodSpec(
+                    containers=[container_spec],
+                    volumes=[storage_initializer_volume],
+                    termination_grace_period_seconds=60,
+                ),
             )
 
-            master_pod_template_spec = training_utils.get_pod_template_spec(
-                containers=[container_spec],
-                init_containers=[init_container_spec],
-                volumes=[storage_initializer_volume],
+            master_pod_template_spec = models.V1PodTemplateSpec(
+                metadata=models.V1ObjectMeta(
+                    annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}
+                ),
+                spec=models.V1PodSpec(
+                    init_containers=[init_container_spec],
+                    containers=[container_spec],
+                    volumes=[storage_initializer_volume],
+                    termination_grace_period_seconds=60,
+                ),
             )
 
             # Create PyTorchJob.

From 64d8fef5062d6f4986028fe07de346da42bb16a2 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Sun, 29 Sep 2024 12:45:20 -0700
Subject: [PATCH 46/57] fix annotations error

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
index 5db30c13387..cb95b20f31b 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -658,7 +658,7 @@ class name in this argument.
 
             worker_pod_template_spec = models.V1PodTemplateSpec(
                 metadata=models.V1ObjectMeta(
-                    annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}
+                    annotations={"sidecar.istio.io/inject": "false"}
                 ),
                 spec=models.V1PodSpec(
                     containers=[container_spec],
@@ -669,7 +669,7 @@ class name in this argument.
 
             master_pod_template_spec = models.V1PodTemplateSpec(
                 metadata=models.V1ObjectMeta(
-                    annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}
+                    annotations={"sidecar.istio.io/inject": "false"}
                 ),
                 spec=models.V1PodSpec(
                     init_containers=[init_container_spec],

From 45db42e73b955d04a21d0931fc41819c1ba2f9c6 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Sun, 29 Sep 2024 20:25:49 -0700
Subject: [PATCH 47/57] restart docker

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index b1c2c64dba0..282e1b402fe 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -43,6 +43,14 @@ jobs:
         if: always()  
         run: |
           docker stats --no-stream
+
+      - name: Restart Docker Service
+        run: |
+          echo "Restarting Docker service..."
+          sudo systemctl restart docker
+          echo "Docker service status:"
+          sudo systemctl --no-pager -l -o short status docker
+          kubectl get pods -n kubeflow
       
       # Step 3: Run e2e test with tune API
       - name: Run e2e test with tune API

From c6e91cdf47a0a3596b7dea414be279cd1c9201a8 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Sun, 29 Sep 2024 20:45:38 -0700
Subject: [PATCH 48/57] remove Docker restart step and step-number comments

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 282e1b402fe..f141d034e57 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -27,7 +27,6 @@ jobs:
         run: |
           pip install "kubeflow-training[huggingface]==1.8.1"
       
-      # Step 2: Check Disk Space Before Test
       - name: Check Disk Space Before Test
         run: |
           docker system prune -a
@@ -43,16 +42,7 @@ jobs:
         if: always()  
         run: |
           docker stats --no-stream
-
-      - name: Restart Docker Service
-        run: |
-          echo "Restarting Docker service..."
-          sudo systemctl restart docker
-          echo "Docker service status:"
-          sudo systemctl --no-pager -l -o short status docker
-          kubectl get pods -n kubeflow
       
-      # Step 3: Run e2e test with tune API
       - name: Run e2e test with tune API
         if: always()
         uses: ./.github/workflows/template-e2e-test
@@ -69,14 +59,12 @@ jobs:
         run: |
           docker stats --no-stream
         
-      # Step 4: Check Disk Space After Test
       - name: Check Disk Space After Test
         if: always()  # Run this step even if previous steps fail
         run: |
           echo "Checking disk space usage after e2e test..."
           df -h  # Run 'df' to check free disk space
 
-      # Step 5: Fetch Pod Logs for Relevant Experiment Pod
       - name: Fetch Experiment Pod Logs
         if: always()  # Run this step even if previous steps fail
         run: |
@@ -90,7 +78,6 @@ jobs:
           echo "Fetching events for experiment pod..."
           kubectl get events -n default | grep "tune-example-2"
       
-      # Step 6: Fetch kubelet logs (requires sudo for accessing kubelet logs)
       - name: Fetch Kubelet Logs
         if: always()  # Run this step even if previous steps fail
         run: |

From b1a2390b6542208b1dea824eed2f276a7f79dbd8 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Mon, 21 Oct 2024 23:30:57 -0700
Subject: [PATCH 49/57] use original docker data directory

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml       |  6 ++++++
 .../template-setup-e2e-test/action.yaml        | 18 ------------------
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index f141d034e57..9111f2f1982 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -50,6 +50,12 @@ jobs:
           tune-api: true
           training-operator: true
       
+      - name: Get YAML file of Experiment
+        if: always()
+        run: |
+          echo "Fetching the YAML file of the experiment..."
+          kubectl get experiment tune-example-2 -n default -o yaml
+      
       - name: Monitor Memory Usage After Run
         if: always()
         run: free -h
diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml
index 93ac2e3fd4d..75ee040aea2 100644
--- a/.github/workflows/template-setup-e2e-test/action.yaml
+++ b/.github/workflows/template-setup-e2e-test/action.yaml
@@ -30,24 +30,6 @@ runs:
 
         echo "Disk usage after cleanup:"
         df -h
-    
-    - name: Move docker data directory
-      shell: bash
-      run: |
-        echo "Stopping docker service ..."
-        sudo systemctl stop docker
-        DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
-        DOCKER_ROOT_DIR=/mnt/docker
-        echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
-        sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
-        echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
-        sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
-        echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
-        echo "Starting docker service ..."
-        sudo systemctl daemon-reload
-        sudo systemctl start docker
-        echo "Docker service status:"
-        sudo systemctl --no-pager -l -o short status docker  
 
     - name: Setup kubectl
       uses: azure/setup-kubectl@v4

From e5bf8401990835778e9cab7e75f194f7aa54551c Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 23 Jan 2025 00:14:50 -0800
Subject: [PATCH 50/57] update installation of Katib SDK with extra requires

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      |  8 +--
 .../kubeflow/katib/api/katib_client.py        | 49 ++++++++++---------
 2 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 9111f2f1982..86cb786647f 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -22,10 +22,10 @@ jobs:
         with:
           kubernetes-version: ${{ matrix.kubernetes-version }}
       
-      - name: Install Training Operator SDK
+      - name: Install Katib SDK with extra requires
         shell: bash
         run: |
-          pip install "kubeflow-training[huggingface]==1.8.1"
+          pip install --prefer-binary -e 'sdk/python/v1beta1[huggingface]'
       
       - name: Check Disk Space Before Test
         run: |
@@ -99,5 +99,5 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # Kubernetes versions to test with
-        kubernetes-version: ["v1.29.2"]
+        # Detail: https://hub.docker.com/r/kindest/node
+        kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
index cb95b20f31b..e5d19da8256 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -415,7 +415,9 @@ class name in this argument.
             experiment.spec.max_failed_trial_count = max_failed_trial_count
 
         # If users choose to use a custom objective function.
-        if objective is not None:
+        if objective is not None or parameters is not None:
+            if not objective or not parameters:
+                raise ValueError("One of the required parameters is None")
             # Add metrics collector to the Katib Experiment.
             # Up to now, we only support parameter `kind`, of which default value
             # is `StdOut`, to specify the kind of metrics collector.
@@ -518,6 +520,7 @@ class name in this argument.
                 from kubeflow.storage_initializer.hugging_face import (
                     HuggingFaceDatasetParams,
                     HuggingFaceModelParams,
+                    HuggingFaceTrainerParams,
                 )
                 from kubeflow.storage_initializer.s3 import S3DatasetParams
                 from kubeflow.training import models as training_models
@@ -596,6 +599,11 @@ class name in this argument.
                     "or HuggingFaceDatasetParams."
                 )
 
+            if not isinstance(trainer_parameters, HuggingFaceTrainerParams):
+                raise ValueError(
+                    "Trainer parameters must be an instance of HuggingFaceTrainerParams."
+                )
+
             # Iterate over input parameters and do substitutions.
             experiment_params = []
             trial_params = []
@@ -645,7 +653,11 @@ class name in this argument.
                     f"'{training_args}'",
                 ],
                 volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT],
-                resources=resources_per_trial.resources_per_worker,
+                resources=(
+                    resources_per_trial.resources_per_worker
+                    if resources_per_trial
+                    else None
+                ),
             )
 
             # Create the worker and the master pod.
@@ -656,27 +668,15 @@ class name in this argument.
                 ),
             )
 
-            worker_pod_template_spec = models.V1PodTemplateSpec(
-                metadata=models.V1ObjectMeta(
-                    annotations={"sidecar.istio.io/inject": "false"}
-                ),
-                spec=models.V1PodSpec(
-                    containers=[container_spec],
-                    volumes=[storage_initializer_volume],
-                    termination_grace_period_seconds=60,
-                ),
+            worker_pod_template_spec = training_utils.get_pod_template_spec(
+                containers=[container_spec],
+                volumes=[storage_initializer_volume],
             )
 
-            master_pod_template_spec = models.V1PodTemplateSpec(
-                metadata=models.V1ObjectMeta(
-                    annotations={"sidecar.istio.io/inject": "false"}
-                ),
-                spec=models.V1PodSpec(
-                    init_containers=[init_container_spec],
-                    containers=[container_spec],
-                    volumes=[storage_initializer_volume],
-                    termination_grace_period_seconds=60,
-                ),
+            master_pod_template_spec = training_utils.get_pod_template_spec(
+                containers=[container_spec],
+                init_containers=[init_container_spec],
+                volumes=[storage_initializer_volume],
             )
 
             # Create PyTorchJob.
@@ -691,7 +691,10 @@ class name in this argument.
                 ),
             )
 
-            if resources_per_trial.num_procs_per_worker:
+            if (
+                resources_per_trial is not None
+                and resources_per_trial.num_procs_per_worker
+            ):
                 pytorchjob.spec.nproc_per_node = str(
                     resources_per_trial.num_procs_per_worker
                 )
@@ -703,7 +706,7 @@ class name in this argument.
                 )
             )
 
-            if resources_per_trial.num_workers > 1:
+            if resources_per_trial is not None and resources_per_trial.num_workers > 1:
                 pytorchjob.spec.pytorch_replica_specs["Worker"] = (
                     training_models.KubeflowOrgV1ReplicaSpec(
                         replicas=resources_per_trial.num_workers - 1,

From fca94ae148b2f4504dec209d84158e9f25e62df5 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Thu, 23 Jan 2025 00:55:06 -0800
Subject: [PATCH 51/57] test trainer image built with cpu

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 sdk/python/v1beta1/kubeflow/katib/api/katib_client.py   | 2 +-
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
index e5d19da8256..cbd4f80d1f2 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -635,7 +635,7 @@ class name in this argument.
 
             container_spec = training_utils.get_container_spec(
                 name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"],
-                base_image=TRAINER_TRANSFORMER_IMAGE,
+                base_image="docker.io/helenxiehz428/trainer:test",
                 args=[
                     "--model_uri",
                     model_provider_parameters.model_uri,
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index b13c19494fb..6ee1b19d68b 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -14,7 +14,7 @@
 from verify import verify_experiment_results
 
 # Experiment timeout is 40 min.
-EXPERIMENT_TIMEOUT = 60 * 10
+EXPERIMENT_TIMEOUT = 60 * 15
 
 # The default logging config.
 logging.basicConfig(level=logging.INFO)

From a785d353a1feb1f3959a2bbd306269585ed2d207 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Fri, 24 Jan 2025 14:42:59 -0800
Subject: [PATCH 52/57] add action to free up disk space (including moving
 docker data directory)

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      | 24 ---------
 .../workflows/free-up-disk-space/action.yaml  | 49 +++++++++++++++++++
 .../template-setup-e2e-test/action.yaml       | 15 +-----
 .../kubeflow/katib/api/katib_client.py        |  2 +-
 4 files changed, 52 insertions(+), 38 deletions(-)
 create mode 100644 .github/workflows/free-up-disk-space/action.yaml

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 86cb786647f..54550b03542 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -34,36 +34,12 @@ jobs:
           echo "Checking disk space usage before e2e test..."
           df -h  # Run 'df' to check free disk space
       
-      - name: Monitor Memory Usage Before Run
-        if: always()
-        run: free -h
-      
-      - name: Monitor Docker Container Memory Usage
-        if: always()  
-        run: |
-          docker stats --no-stream
-      
       - name: Run e2e test with tune API
         if: always()
         uses: ./.github/workflows/template-e2e-test
         with:
           tune-api: true
           training-operator: true
-      
-      - name: Get YAML file of Experiment
-        if: always()
-        run: |
-          echo "Fetching the YAML file of the experiment..."
-          kubectl get experiment tune-example-2 -n default -o yaml
-      
-      - name: Monitor Memory Usage After Run
-        if: always()
-        run: free -h
-      
-      - name: Monitor Docker Container Memory Usage
-        if: always()  
-        run: |
-          docker stats --no-stream
         
       - name: Check Disk Space After Test
         if: always()  # Run this step even if previous steps fail
diff --git a/.github/workflows/free-up-disk-space/action.yaml b/.github/workflows/free-up-disk-space/action.yaml
new file mode 100644
index 00000000000..110e3a21b84
--- /dev/null
+++ b/.github/workflows/free-up-disk-space/action.yaml
@@ -0,0 +1,49 @@
+name: Free-Up Disk Space
+description: Remove Non-Essential Tools And Move Docker Data Directory to /mnt/docker
+
+runs:
+  using: composite
+  steps:
+    # This step is a Workaround to avoid the "No space left on device" error.
+    # ref: https://github.com/actions/runner-images/issues/2840
+    - name: Remove unnecessary files
+      shell: bash
+      run: |
+        echo "Disk usage before cleanup:"
+        df -hT
+
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /opt/ghc
+        sudo rm -rf /usr/local/share/boost
+        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+        sudo rm -rf /usr/local/lib/android
+        sudo rm -rf /usr/local/share/powershell
+        sudo rm -rf /usr/share/swift
+
+        echo "Disk usage after cleanup:"
+        df -hT
+
+    - name: Prune docker images
+      shell: bash
+      run: |
+        docker image prune -a -f
+        docker system df
+        df -hT
+
+    - name: Move docker data directory
+      shell: bash
+      run: |
+        echo "Stopping docker service ..."
+        sudo systemctl stop docker
+        DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
+        DOCKER_ROOT_DIR=/mnt/docker
+        echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
+        sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
+        echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
+        sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
+        echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
+        echo "Starting docker service ..."
+        sudo systemctl daemon-reload
+        sudo systemctl start docker
+        echo "Docker service status:"
+        sudo systemctl --no-pager -l -o short status docker
\ No newline at end of file
diff --git a/.github/workflows/template-setup-e2e-test/action.yaml b/.github/workflows/template-setup-e2e-test/action.yaml
index 561f127648a..f85697fe3d5 100644
--- a/.github/workflows/template-setup-e2e-test/action.yaml
+++ b/.github/workflows/template-setup-e2e-test/action.yaml
@@ -17,19 +17,8 @@ runs:
   steps:
     # This step is a Workaround to avoid the "No space left on device" error.
     # ref: https://github.com/actions/runner-images/issues/2840
-    - name: Remove unnecessary files
-      shell: bash
-      run: |
-        sudo rm -rf /usr/share/dotnet
-        sudo rm -rf /opt/ghc
-        sudo rm -rf "/usr/local/share/boost"
-        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-        sudo rm -rf /usr/local/lib/android
-        sudo rm -rf /usr/local/share/powershell
-        sudo rm -rf /usr/share/swift
-
-        echo "Disk usage after cleanup:"
-        df -h
+    - name: Free-Up Disk Space
+      uses: ./.github/workflows/free-up-disk-space
 
     - name: Setup kubectl
       uses: azure/setup-kubectl@v4
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
index 82a3712cda1..b641800290f 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -635,7 +635,7 @@ class name in this argument.
 
             container_spec = training_utils.get_container_spec(
                 name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"],
-                base_image="docker.io/helenxiehz428/trainer:test",
+                base_image=TRAINER_TRANSFORMER_IMAGE,
                 args=[
                     "--model_uri",
                     model_provider_parameters.model_uri,

From 865379e37ec5200a65593a61f1cf1aedf79d9940 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Fri, 24 Jan 2025 15:59:59 -0800
Subject: [PATCH 53/57] delete unnecessary checks and update the step that
 fetches pod descriptions and logs

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      | 57 ++++++++-----------
 .../scripts/gh-actions/run-e2e-tune-api.py    | 14 +----
 2 files changed, 25 insertions(+), 46 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 54550b03542..753802788bb 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -27,50 +27,39 @@ jobs:
         run: |
           pip install --prefer-binary -e 'sdk/python/v1beta1[huggingface]'
       
-      - name: Check Disk Space Before Test
-        run: |
-          docker system prune -a
-          docker volume prune
-          echo "Checking disk space usage before e2e test..."
-          df -h  # Run 'df' to check free disk space
-      
       - name: Run e2e test with tune API
-        if: always()
         uses: ./.github/workflows/template-e2e-test
         with:
           tune-api: true
           training-operator: true
-        
-      - name: Check Disk Space After Test
-        if: always()  # Run this step even if previous steps fail
-        run: |
-          echo "Checking disk space usage after e2e test..."
-          df -h  # Run 'df' to check free disk space
 
-      - name: Fetch Experiment Pod Logs
-        if: always()  # Run this step even if previous steps fail
+      - name: Fetch Pod Description and Logs for Experiment # This step is added to debug the test failure
+        if: always()
         run: |
-          echo "Fetching all the pods in the default namespace..."
+          echo "Fetching all the pods..."
           kubectl get pods -n default
-          POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master)
-          echo "Fetching pod description for experiment pod..."
-          kubectl describe pod $POD_NAME -n default
-          echo "Fetching logs for experiment pod..."
-          kubectl logs $POD_NAME -n default --all-containers
-          echo "Fetching events for experiment pod..."
-          kubectl get events -n default | grep "tune-example-2"
-      
-      - name: Fetch Kubelet Logs
-        if: always()  # Run this step even if previous steps fail
-        run: |
-          echo "Fetching kubelet logs..."
-          sudo journalctl -u kubelet
 
-      - name: Check container runtime logs
-        if: always()  # Run this step even if previous steps fail
+          POD_NAME_1=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-1 | grep master)
+
+          echo "Fetching pod description for tune-example-1..."
+          kubectl describe pod $POD_NAME_1 -n default
+
+          echo "Fetching pod logs for tune-example-1..."
+          kubectl logs $POD_NAME_1 -n default --all-containers
+
+          POD_NAME_2=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master)
+
+          echo "Fetching pod description for tune-example-2..."
+          kubectl describe pod $POD_NAME_2 -n default
+
+          echo "Fetching pod logs for tune-example-2..."
+          kubectl logs $POD_NAME_2 -n default --all-containers
+      
+      - name: Delete Experiment for e2e test
+        if: always()
         run: |
-          echo "Checking container runtime logs..."
-          sudo journalctl -u docker
+          kubectl delete experiment tune-example-1 -n default
+          kubectl delete experiment tune-example-2 -n default
 
     strategy:
       fail-fast: false
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 6ee1b19d68b..7a98a93c20e 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -14,12 +14,12 @@
 from verify import verify_experiment_results
 
 # Experiment timeout is 40 min.
-EXPERIMENT_TIMEOUT = 60 * 15
+EXPERIMENT_TIMEOUT = 60 * 40
 
 # The default logging config.
 logging.basicConfig(level=logging.INFO)
 
-# Test for Experiment created with custom objective.
+# Test for Experiment created with custom objective function.
 def run_e2e_experiment_create_by_tune_with_custom_objective(
     katib_client: KatibClient,
     exp_name: str,
@@ -166,11 +166,6 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-1")
         raise e
-    finally:
-        # Delete the Experiment.
-        logging.info("---------------------------------------------------------------")
-        logging.info("---------------------------------------------------------------")
-        #katib_client.delete_experiment(f"{exp_name}-1", exp_namespace)
 
     try:
         run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace)
@@ -180,8 +175,3 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
         raise e
-    finally:
-        # Delete the Experiment.
-        logging.info("---------------------------------------------------------------")
-        logging.info("---------------------------------------------------------------")
-        #katib_client.delete_experiment(f"{exp_name}-2", exp_namespace)

From d1ea629f77fd1d0228a3b35a125fd973ce1f7db6 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Fri, 24 Jan 2025 16:27:37 -0800
Subject: [PATCH 54/57] delete fetching pod logs

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/e2e-test-tune-api.yaml      | 28 -------------------
 .../scripts/gh-actions/run-e2e-tune-api.py    | 10 +++++++
 2 files changed, 10 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 753802788bb..186a0983b81 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -33,34 +33,6 @@ jobs:
           tune-api: true
           training-operator: true
 
-      - name: Fetch Pod Description and Logs for Experiment # This step is added to debug the test failure
-        if: always()
-        run: |
-          echo "Fetching all the pods..."
-          kubectl get pods -n default
-
-          POD_NAME_1=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-1 | grep master)
-
-          echo "Fetching pod description for tune-example-1..."
-          kubectl describe pod $POD_NAME_1 -n default
-
-          echo "Fetching pod logs for tune-example-1..."
-          kubectl logs $POD_NAME_1 -n default --all-containers
-
-          POD_NAME_2=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master)
-
-          echo "Fetching pod description for tune-example-2..."
-          kubectl describe pod $POD_NAME_2 -n default
-
-          echo "Fetching pod logs for tune-example-2..."
-          kubectl logs $POD_NAME_2 -n default --all-containers
-      
-      - name: Delete Experiment for e2e test
-        if: always()
-        run: |
-          kubectl delete experiment tune-example-1 -n default
-          kubectl delete experiment tune-example-2 -n default
-
     strategy:
       fail-fast: false
       matrix:
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 7a98a93c20e..9e2cb732343 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -166,6 +166,11 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-1")
         raise e
+    finally:
+        # Delete the Experiment.
+        logging.info("---------------------------------------------------------------")
+        logging.info("---------------------------------------------------------------")
+        katib_client.delete_experiment(f"{exp_name}-1", exp_namespace)
 
     try:
         run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace)
@@ -175,3 +180,8 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
         raise e
+    finally:
+        # Delete the Experiment.
+        logging.info("---------------------------------------------------------------")
+        logging.info("---------------------------------------------------------------")
+        katib_client.delete_experiment(f"{exp_name}-2", exp_namespace)

From 5e2e44f9c869d0163c6fb395222489332def62f7 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Mon, 27 Jan 2025 09:55:11 -0800
Subject: [PATCH 55/57] add trailing newline at the end of free-up-disk-space
 yaml file

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .github/workflows/free-up-disk-space/action.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/free-up-disk-space/action.yaml b/.github/workflows/free-up-disk-space/action.yaml
index 110e3a21b84..c85e44e8c58 100644
--- a/.github/workflows/free-up-disk-space/action.yaml
+++ b/.github/workflows/free-up-disk-space/action.yaml
@@ -46,4 +46,4 @@ runs:
         sudo systemctl daemon-reload
         sudo systemctl start docker
         echo "Docker service status:"
-        sudo systemctl --no-pager -l -o short status docker
\ No newline at end of file
+        sudo systemctl --no-pager -l -o short status docker

From 982e2687d4e1de5e06fbdf563863107ef81a2a66 Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Mon, 27 Jan 2025 09:57:25 -0800
Subject: [PATCH 56/57] update experiment name

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 .../scripts/gh-actions/run-e2e-tune-api.py    | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 9e2cb732343..aaa6e074d56 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -156,32 +156,33 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         client.CoreV1Api().patch_namespace(args.namespace, {'metadata': {'labels': namespace_labels}})
 
     # Test with run_e2e_experiment_create_by_tune
-    exp_name = "tune-example"
+    exp_name_custom_objective = "tune-example-1"
+    exp_name_llm_optimization = "tune-example-2"
     exp_namespace = args.namespace
     try:
-        run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, f"{exp_name}-1", exp_namespace)
+        run_e2e_experiment_create_by_tune_with_custom_objective(katib_client, exp_name_custom_objective, exp_namespace)
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-1")
+        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name_custom_objective}")
     except Exception as e:
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-1")
+        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_custom_objective}")
         raise e
     finally:
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
-        katib_client.delete_experiment(f"{exp_name}-1", exp_namespace)
+        katib_client.delete_experiment(exp_name_custom_objective, exp_namespace)
 
     try:
-        run_e2e_experiment_create_by_tune_with_external_model(katib_client, f"{exp_name}-2", exp_namespace)
+        run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name_llm_optimization, exp_namespace)
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}-2")
+        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name_llm_optimization}")
     except Exception as e:
         logging.info("---------------------------------------------------------------")
-        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
+        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_llm_optimization}")
         raise e
     finally:
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
-        katib_client.delete_experiment(f"{exp_name}-2", exp_namespace)
+        katib_client.delete_experiment(exp_name_llm_optimization, exp_namespace)

From 55c404d691276695b08ccb24c84fbc04fb0be66f Mon Sep 17 00:00:00 2001
From: helenxie-bit <helenxiehz@gmail.com>
Date: Mon, 27 Jan 2025 09:59:01 -0800
Subject: [PATCH 57/57] update test function name to be consistent with
 experiment name

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
---
 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index aaa6e074d56..b9302d4f8fa 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -66,7 +66,7 @@ def objective(parameters):
     logging.debug(katib_client.get_suggestion(exp_name, exp_namespace))
 
 # Test for Experiment created with external models and datasets.
-def run_e2e_experiment_create_by_tune_with_external_model(
+def run_e2e_experiment_create_by_tune_with_llm_optimization(
     katib_client: KatibClient,
     exp_name: str,
     exp_namespace: str,
@@ -174,7 +174,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         katib_client.delete_experiment(exp_name_custom_objective, exp_namespace)
 
     try:
-        run_e2e_experiment_create_by_tune_with_external_model(katib_client, exp_name_llm_optimization, exp_namespace)
+        run_e2e_experiment_create_by_tune_with_llm_optimization(katib_client, exp_name_llm_optimization, exp_namespace)
         logging.info("---------------------------------------------------------------")
         logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name_llm_optimization}")
     except Exception as e: