
Commit

Set continue_on_step_failure setting to False for FT, Eval, Import notebooks (Azure#2366)

* Disable continue job on failure

* replicate for all notebooks

* Reformat

* Revert metadata
skasturi authored Jun 11, 2023
1 parent 3dafee3 commit f922f0e
Showing 12 changed files with 118 additions and 75 deletions.
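In the Azure ML SDK v2, continue_on_step_failure is a pipeline-level setting: when it is False, the pipeline job stops as soon as any step fails rather than letting the remaining steps run. The sketch below shows the submission pattern these notebooks use after this change; the single-step pipeline, the placeholder workspace values, the environment name, and the experiment name are illustrative stand-ins for the notebook-specific definitions (create_pipeline, compute_cluster, experiment_name), not part of the commit.

from azure.ai.ml import MLClient, command
from azure.ai.ml.dsl import pipeline
from azure.identity import DefaultAzureCredential

# Placeholder workspace details -- substitute real values before running.
workspace_ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)

# Hypothetical single-step job standing in for the real components used in the notebooks.
hello_step = command(
    command="echo hello",
    environment="<ENVIRONMENT_NAME>@latest",  # any registered or curated environment
)

@pipeline()
def create_pipeline():
    hello_step()

pipeline_object = create_pipeline()

# don't reuse cached results from previous jobs
pipeline_object.settings.force_rerun = True
pipeline_object.settings.default_compute = "<COMPUTE_CLUSTER_NAME>"

# the change in this commit: stop the whole pipeline as soon as any step fails
pipeline_object.settings.continue_on_step_failure = False

pipeline_job = workspace_ml_client.jobs.create_or_update(
    pipeline_object, experiment_name="continue-on-step-failure-demo"
)

Like force_rerun, the new flag is an attribute on pipeline_object.settings, so the pipeline components themselves need no changes; the diffs below apply the same two-line addition to each notebook.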
@@ -223,6 +223,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models = []\n",
@@ -231,21 +234,18 @@
" reg_model = list(registry_ml_client.models.list(name=model[\"name\"]))[0]\n",
" print(reg_model.id)\n",
" models.append({**model, \"version\": reg_model.version})"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models"
],
"metadata": {
"collapsed": false
}
]
},
{
"attachments": {},
@@ -262,14 +262,14 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%pip install transformers\n",
"%pip install torch"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
@@ -302,6 +302,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
@@ -319,10 +322,7 @@
" )\n",
" test_data_file_name = \"small-test-{}.jsonl\".format(model[\"name\"])\n",
" test_data_df.to_json(test_data_file_name, lines=True, orient=\"records\")"
],
"metadata": {
"collapsed": false
}
]
},
{
"attachments": {},
@@ -417,6 +417,11 @@
" # don't reuse cached results from previous jobs\n",
" pipeline_object.settings.force_rerun = True\n",
" pipeline_object.settings.default_compute = compute_cluster\n",
"\n",
" # set continue on step failure to False\n",
" pipeline_object.settings.continue_on_step_failure = False\n",
"\n",
" pipeline_object.display_name = f\"eval-{model['name']}-{timestamp}\"\n",
" pipeline_job = workspace_ml_client.jobs.create_or_update(\n",
" pipeline_object, experiment_name=experiment_name\n",
" )\n",
@@ -217,6 +217,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models = []\n",
@@ -225,21 +228,18 @@
" reg_model = list(registry_ml_client.models.list(name=model[\"name\"]))[0]\n",
" print(reg_model.id)\n",
" models.append({**model, \"version\": reg_model.version})"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models"
],
"metadata": {
"collapsed": false
}
]
},
{
"attachments": {},
@@ -406,6 +406,10 @@
" # don't reuse cached results from previous jobs\n",
" pipeline_object.settings.force_rerun = True\n",
" pipeline_object.settings.default_compute = compute_cluster\n",
"\n",
" # set continue on step failure to False\n",
" pipeline_object.settings.continue_on_step_failure = False\n",
"\n",
" pipeline_object.display_name = f\"eval-{model['name']}-{timestamp}\"\n",
" pipeline_job = workspace_ml_client.jobs.create_or_update(\n",
" pipeline_object, experiment_name=experiment_name\n",
@@ -210,6 +210,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models = []\n",
@@ -218,21 +221,18 @@
" reg_model = list(registry_ml_client.models.list(name=model[\"name\"]))[0]\n",
" print(reg_model.id)\n",
" models.append({**model, \"version\": reg_model.version})"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models"
],
"metadata": {
"collapsed": false
}
]
},
{
"attachments": {},
@@ -401,6 +401,10 @@
" # don't reuse cached results from previous jobs\n",
" pipeline_object.settings.force_rerun = True\n",
" pipeline_object.settings.default_compute = compute_cluster\n",
"\n",
" # set continue on step failure to False\n",
" pipeline_object.settings.continue_on_step_failure = False\n",
"\n",
" pipeline_object.display_name = f\"eval-{model['name']}-{timestamp}\"\n",
" pipeline_job = workspace_ml_client.jobs.create_or_update(\n",
" pipeline_object, experiment_name=experiment_name\n",
@@ -210,6 +210,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models = []\n",
@@ -218,21 +221,18 @@
" reg_model = list(registry_ml_client.models.list(name=model[\"name\"]))[0]\n",
" print(reg_model.id)\n",
" models.append({**model, \"version\": reg_model.version})"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models"
],
"metadata": {
"collapsed": false
}
]
},
{
"attachments": {},
@@ -399,6 +399,10 @@
" # don't reuse cached results from previous jobs\n",
" pipeline_object.settings.force_rerun = True\n",
" pipeline_object.settings.default_compute = compute_cluster\n",
"\n",
" # set continue on step failure to False\n",
" pipeline_object.settings.continue_on_step_failure = False\n",
"\n",
" pipeline_object.display_name = f\"eval-{model['name']}-{timestamp}\"\n",
" pipeline_job = workspace_ml_client.jobs.create_or_update(\n",
" pipeline_object, experiment_name=experiment_name\n",
@@ -209,6 +209,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models = []\n",
@@ -217,21 +220,18 @@
" reg_model = list(registry_ml_client.models.list(name=model[\"name\"]))[0]\n",
" print(reg_model.id)\n",
" models.append({**model, \"version\": reg_model.version})"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models"
],
"metadata": {
"collapsed": false
}
]
},
{
"attachments": {},
@@ -431,6 +431,10 @@
" # don't reuse cached results from previous jobs\n",
" pipeline_object.settings.force_rerun = True\n",
" pipeline_object.settings.default_compute = compute_cluster\n",
"\n",
" # set continue on step failure to False\n",
" pipeline_object.settings.continue_on_step_failure = False\n",
"\n",
" pipeline_object.display_name = f\"eval-{model['name']}-{timestamp}\"\n",
" pipeline_job = workspace_ml_client.jobs.create_or_update(\n",
" pipeline_object, experiment_name=experiment_name\n",
@@ -211,6 +211,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models = []\n",
@@ -219,21 +222,18 @@
" reg_model = list(registry_ml_client.models.list(name=model[\"name\"]))[0]\n",
" print(reg_model.id)\n",
" models.append({**model, \"version\": reg_model.version})"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"models"
],
"metadata": {
"collapsed": false
}
]
},
{
"attachments": {},
@@ -401,6 +401,10 @@
" # don't reuse cached results from previous jobs\n",
" pipeline_object.settings.force_rerun = True\n",
" pipeline_object.settings.default_compute = compute_cluster\n",
"\n",
" # set continue on step failure to False\n",
" pipeline_object.settings.continue_on_step_failure = False\n",
"\n",
" pipeline_object.display_name = f\"eval-{model['name']}-{timestamp}\"\n",
" pipeline_job = workspace_ml_client.jobs.create_or_update(\n",
" pipeline_object, experiment_name=experiment_name\n",
@@ -319,7 +319,10 @@
"pipeline_object = create_pipeline()\n",
"\n",
"# don't use cached results from previous jobs\n",
"pipeline_object.settings.force_rerun = True"
"pipeline_object.settings.force_rerun = True\n",
"\n",
"# set continue on step failure to False\n",
"pipeline_object.settings.continue_on_step_failure = False"
]
},
{
@@ -311,7 +311,10 @@
"pipeline_object = create_pipeline()\n",
"\n",
"# don't use cached results from previous jobs\n",
"pipeline_object.settings.force_rerun = True"
"pipeline_object.settings.force_rerun = True\n",
"\n",
"# set continue on step failure to False\n",
"pipeline_object.settings.continue_on_step_failure = False"
]
},
{
@@ -135,7 +135,7 @@
" )\n",
" gpus_per_node = 1\n",
"\n",
"# genrating a unique timestamp that can be used for names and versions that need to be unique\n",
"# generating a unique timestamp that can be used for names and versions that need to be unique\n",
"timestamp = str(int(time.time()))"
]
},
@@ -338,7 +338,10 @@
"pipeline_object = create_pipeline()\n",
"\n",
"# don't use cached results from previous jobs\n",
"pipeline_object.settings.force_rerun = True"
"pipeline_object.settings.force_rerun = True\n",
"\n",
"# set continue on step failure to False\n",
"pipeline_object.settings.continue_on_step_failure = False"
]
},
{
@@ -314,7 +314,10 @@
"pipeline_object = create_pipeline()\n",
"\n",
"# don't use cached results from previous jobs\n",
"pipeline_object.settings.force_rerun = True"
"pipeline_object.settings.force_rerun = True\n",
"\n",
"# set continue on step failure to False\n",
"pipeline_object.settings.continue_on_step_failure = False"
]
},
{
@@ -306,7 +306,10 @@
"pipeline_object = create_pipeline()\n",
"\n",
"# don't use cached results from previous jobs\n",
"pipeline_object.settings.force_rerun = True"
"pipeline_object.settings.force_rerun = True\n",
"\n",
"# set continue on step failure to False\n",
"pipeline_object.settings.continue_on_step_failure = False"
]
},
{
