Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GSoC] Add unit tests for tune API #2423

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 26 additions & 8 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,10 @@ class name in this argument.
experiment.spec.max_failed_trial_count = max_failed_trial_count

# If users choose to use a custom objective function.
if objective is not None:
if objective is not None or parameters is not None:
if objective is None or base_image is None or parameters is None:
raise ValueError("One of the required parameters is None")

helenxie-bit marked this conversation as resolved.
Show resolved Hide resolved
# Add metrics collector to the Katib Experiment.
# Currently, the only supported parameter is `kind`, which specifies the
# kind of metrics collector and defaults to `StdOut`.
Expand Down Expand Up @@ -504,9 +507,9 @@ class name in this argument.
# If users choose to use external models and datasets.
else:
if (
not model_provider_parameters
or not dataset_provider_parameters
or not trainer_parameters
model_provider_parameters is None
or dataset_provider_parameters is None
or trainer_parameters is None
):
raise ValueError("One of the required parameters is None")

Expand All @@ -518,6 +521,7 @@ class name in this argument.
from kubeflow.storage_initializer.hugging_face import (
HuggingFaceDatasetParams,
HuggingFaceModelParams,
HuggingFaceTrainerParams,
)
from kubeflow.storage_initializer.s3 import S3DatasetParams
from kubeflow.training import models as training_models
Expand Down Expand Up @@ -567,7 +571,7 @@ class name in this argument.
)
except Exception as e:
pvc_list = self.core_api.list_namespaced_persistent_volume_claim(
namespace
namespace=namespace
)
# Check if the PVC with the specified name exists.
for pvc in pvc_list.items:
Expand Down Expand Up @@ -596,6 +600,11 @@ class name in this argument.
"or HuggingFaceDatasetParams."
)

if not isinstance(trainer_parameters, HuggingFaceTrainerParams):
raise ValueError(
"Trainer parameters must be an instance of HuggingFaceTrainerParams."
)

# Iterate over input parameters and do substitutions.
experiment_params = []
trial_params = []
Expand Down Expand Up @@ -633,6 +642,8 @@ class name in this argument.
model_provider_parameters.model_uri,
"--transformer_type",
model_provider_parameters.transformer_type.__name__,
"--num_labels",
str(model_provider_parameters.num_labels),
"--model_dir",
VOLUME_PATH_MODEL,
"--dataset_dir",
Expand All @@ -643,7 +654,11 @@ class name in this argument.
f"'{training_args}'",
],
volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT],
resources=resources_per_trial.resources_per_worker,
resources=(
resources_per_trial.resources_per_worker
if resources_per_trial
else None
),
)

# Create the worker and the master pod.
Expand Down Expand Up @@ -677,7 +692,10 @@ class name in this argument.
),
)

if resources_per_trial.num_procs_per_worker:
if (
resources_per_trial is not None
and resources_per_trial.num_procs_per_worker
):
pytorchjob.spec.nproc_per_node = str(
resources_per_trial.num_procs_per_worker
)
Expand All @@ -689,7 +707,7 @@ class name in this argument.
)
)

if resources_per_trial.num_workers > 1:
if resources_per_trial is not None and resources_per_trial.num_workers > 1:
pytorchjob.spec.pytorch_replica_specs["Worker"] = (
training_models.KubeflowOrgV1ReplicaSpec(
replicas=resources_per_trial.num_workers - 1,
Expand Down
Loading