Fix deepspeed examples bugs (Azure#2044)
* add timeout for deepspeed jobs

* reformat readme with black

* change timeout length

* change dockerfile to use acpt image

* add training custom env

* fix hostfile bug

* fix bash generation

* address comments

* increase number of gpus being used

* make sure deepspeed is upgraded to latest version

* write to hostfile in single process
cassieesvelt authored Feb 6, 2023
1 parent e9c6241 commit 160461b
Showing 8 changed files with 61 additions and 80 deletions.
42 changes: 3 additions & 39 deletions cli/jobs/deepspeed/deepspeed-autotuning/docker-context/Dockerfile
@@ -1,39 +1,3 @@
FROM mcr.microsoft.com/azureml/aifx/stable-ubuntu2004-cu113-py38-torch1110:biweekly.202301.1
RUN pip install git+https://github.com/microsoft/DeepSpeed.git@master

# Install pip dependencies
RUN pip install 'ipykernel~=6.0' \
'azureml-core==1.48.0' \
'azureml-dataset-runtime==1.48.0' \
'azureml-defaults==1.48.0' \
'azure-ml==0.0.1' \
'azure-ml-component==0.9.16.post2' \
'azureml-mlflow==1.48.0' \
'azureml-telemetry==1.48.0' \
'azureml-contrib-services==1.48.0' \
'torch-tb-profiler~=0.4.0' \
'py-spy==0.3.12' \
'debugpy~=1.6.3'

RUN pip install \
azure-ai-ml==1.2.0 \
azureml-inference-server-http~=0.7.0 \
inference-schema~=1.4.2.1 \
MarkupSafe==2.0.1 \
regex \
pybind11

# Inference requirements
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20220607.v1 /artifacts /var/
RUN /var/requirements/install_system_requirements.sh && \
cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
rm -f /etc/nginx/sites-enabled/default
ENV SVDIR=/var/runit
ENV WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# support Deepspeed launcher requirement of passwordless ssh login
RUN apt-get update
RUN apt-get install -y openssh-server openssh-client
FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-py38-cuda11.3-gpu:latest
# Need latest deepspeed version
RUN pip install deepspeed -U
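
The trimmed Dockerfile now just extends the curated ACPT image and upgrades DeepSpeed. A quick local sanity check might look like the following sketch (the image tag is illustrative, not part of the repo):

  # Build the slimmed-down image from the example's docker-context folder.
  docker build -t acpt-deepspeed-autotune cli/jobs/deepspeed/deepspeed-autotuning/docker-context

  # Confirm the upgraded DeepSpeed import works inside the image.
  docker run --rm acpt-deepspeed-autotune python -c "import deepspeed; print(deepspeed.__version__)"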
10 changes: 7 additions & 3 deletions cli/jobs/deepspeed/deepspeed-autotuning/generate-yml.sh
@@ -2,13 +2,15 @@
# Generate key
ssh-keygen -t rsa -f './src/generated-key' -N ''

# Generate yaml file with key path
# Pre-set num_gpus_per_node so it can be passed into deepspeed via bash script.
num_gpus_per_node=8

cat > job.yml << EOF
# Training job submission via AML CLI v2
\$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
command: bash start-deepspeed.sh --autotuning tune --force_multi train.py --with_aml_log=True --deepspeed --deepspeed_config ds_config.json
command: bash start-deepspeed.sh ${num_gpus_per_node} --autotuning tune --force_multi train.py --with_aml_log=True --deepspeed --deepspeed_config ds_config.json
experiment_name: DistributedJob-DeepsSpeed-Autotuning-cifar
display_name: deepspeed-autotuning-example
@@ -20,6 +22,8 @@ environment_variables:
AZUREML_COMPUTE_USE_COMMON_RUNTIME: 'True'
AZUREML_COMMON_RUNTIME_USE_INTERACTIVE_CAPABILITY: 'True'
AZUREML_SSH_KEY: 'generated-key'
limits:
timeout: 1800
outputs:
output:
type: uri_folder
@@ -28,7 +32,7 @@ outputs:
compute: azureml:gpu-v100-cluster
distribution:
type: pytorch
process_count_per_instance: 1
process_count_per_instance: ${num_gpus_per_node}
resources:
instance_count: 2
EOF
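With num_gpus_per_node baked into the generated YAML and the new limits block capping runtime, regenerating and submitting the job follows the usual CLI v2 flow. A minimal sketch, assuming a configured Azure ML workspace (placeholder names are mine):

  # Regenerate job.yml with the SSH key path and the per-node GPU count filled in.
  bash generate-yml.sh

  # Submit the command job with the Azure ML CLI v2 extension.
  az ml job create --file job.yml --resource-group <my-resource-group> --workspace-name <my-workspace>
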
24 changes: 14 additions & 10 deletions cli/jobs/deepspeed/deepspeed-autotuning/src/start-deepspeed.sh
@@ -1,6 +1,5 @@
#!/bin/bash
az_batch_host_list="$AZ_BATCH_HOST_LIST"
local_gpu_count=$((AZ_BATCHAI_GPU_COUNT / AZUREML_NODE_COUNT))
RANK="$AZUREML_CR_NODE_RANK"

# Start ssh
@@ -19,31 +18,36 @@ touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
/usr/sbin/sshd -D -p 1143 &

# Create hostfile
# Create hostfile. Use num_gpus_per_node to populate slots value.
oldIFS=IFS
IFS=',' read -ra host_list <<< "$az_batch_host_list"
sudo mkdir /job
for i in "${host_list[@]}"
do
echo "$i" slots="$local_gpu_count" >> /job/hostfile
echo "$i" slots="$local_gpu_count" >> /job/hostfile.txt
done
IFS=$oldIFS

sudo mkdir /job
if [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
for i in "${host_list[@]}"
do
echo "$i" slots=$1 >> /job/hostfile
echo "$i" slots=$1 >> /job/hostfile.txt
done
fi

echo Hostfile generated
echo ------------
cat /job/hostfile.txt
cat /job/hostfile
echo ------------

# Create deepspeed call
ds_call="deepspeed --hostfile /job/hostfile "
shift
for i in "$@"
do
ds_call+=$i
ds_call+=" "
done
ls
if [ $RANK == 0 ]
if [[ $RANK == 0 ]] && [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
echo rank is 0, starting deepspeed
sleep 60
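The reworked script takes the slot count as its first argument (hence the shift before assembling the remaining deepspeed arguments) and writes the hostfile from a single process. A minimal sketch of the effect, with illustrative host names:

  # Only the process named rank_0 writes the hostfile, instead of every launched
  # process (now 8 per node), which avoids duplicate entries.
  if [[ "$AZUREML_PROCESS_NAME" == "rank_0" ]]
  then
      for host in 10.0.0.4 10.0.0.5
      do
          echo "$host" slots=8 >> /job/hostfile
      done
  fi

  # Resulting /job/hostfile read by "deepspeed --hostfile /job/hostfile ...":
  # 10.0.0.4 slots=8
  # 10.0.0.5 slots=8
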
12 changes: 5 additions & 7 deletions cli/jobs/deepspeed/deepspeed-autotuning/src/train.py
@@ -56,8 +56,9 @@ def add_argument():

# Need args here to set ranks for multi-node training with download=True
args = add_argument()
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))
if args.with_aml_log:
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))
########################################################################
# The output of torchvision datasets are PILImage images of range [0, 1].
# We transform them to Tensors of normalized range [-1, 1].
@@ -130,10 +131,7 @@ def add_argument():
#
# Showcasing logging metrics to automl.
if args.with_aml_log:
this_run = mlflow.active_run()
if this_run:
print("Active run_id: {}".format(this_run.info.run_id))
mlflow.log_metrics({"hello": 12345})
mlflow.log_metrics({"hello": 12345})
# This is when things start to get interesting.
# We simply have to loop over our data iterator, and feed the inputs to the
# network and optimize.
@@ -161,7 +159,7 @@ def add_argument():
# if i % 2000 == 1999: # print every 2000 mini-batches
loss = running_loss / 2000
print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, loss))
if args.with_aml_log and mlflow.active_run():
if args.with_aml_log:
try:
mlflow.log_metrics({"loss": loss})
except NameError:
3 changes: 3 additions & 0 deletions cli/jobs/deepspeed/deepspeed-training/docker-context/Dockerfile
@@ -0,0 +1,3 @@
FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-py38-cuda11.3-gpu:latest
# Need latest deepspeed version
RUN pip install deepspeed -U
14 changes: 10 additions & 4 deletions cli/jobs/deepspeed/deepspeed-training/generate-yml.sh
@@ -2,22 +2,28 @@
# Generate key
ssh-keygen -t rsa -f './src/generated-key' -N ''

# Generate yaml file with key path
# Pre-set num_gpus_per_node so it can be passed into deepspeed via bash script.
num_gpus_per_node=8

cat > job.yml << EOF
# Training job submission via AML CLI v2
\$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
command: bash start-deepspeed.sh --force_multi train.py --with_aml_log=True --deepspeed --deepspeed_config ds_config.json
command: bash start-deepspeed.sh ${num_gpus_per_node} --force_multi train.py --with_aml_log=True --deepspeed --deepspeed_config ds_config.json
experiment_name: DistributedJob-DeepsSpeed-Training-cifar
display_name: deepspeed-training-example
code: src
environment: azureml:AzureML-ACPT-pytorch-1.11-py38-cuda11.3-gpu@latest
environment:
build:
path: docker-context
environment_variables:
AZUREML_COMPUTE_USE_COMMON_RUNTIME: 'True'
AZUREML_COMMON_RUNTIME_USE_INTERACTIVE_CAPABILITY: 'True'
AZUREML_SSH_KEY: 'generated-key'
limits:
timeout: 900
outputs:
output:
type: uri_folder
@@ -26,7 +32,7 @@ outputs:
compute: azureml:gpu-v100-cluster
distribution:
type: pytorch
process_count_per_instance: 1
process_count_per_instance: ${num_gpus_per_node}
resources:
instance_count: 2
EOF
24 changes: 14 additions & 10 deletions cli/jobs/deepspeed/deepspeed-training/src/start-deepspeed.sh
@@ -1,6 +1,5 @@
#!/bin/bash
az_batch_host_list="$AZ_BATCH_HOST_LIST"
local_gpu_count=$((AZ_BATCHAI_GPU_COUNT / AZUREML_NODE_COUNT))
RANK="$AZUREML_CR_NODE_RANK"

# Start ssh
@@ -19,31 +18,36 @@ touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
/usr/sbin/sshd -D -p 1143 &

# Create hostfile
# Create hostfile. Use num_gpus_per_node to populate the slots variable.
oldIFS=IFS
IFS=',' read -ra host_list <<< "$az_batch_host_list"
sudo mkdir /job
for i in "${host_list[@]}"
do
echo "$i" slots="$local_gpu_count" >> /job/hostfile
echo "$i" slots="$local_gpu_count" >> /job/hostfile.txt
done
IFS=$oldIFS

sudo mkdir /job
if [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
for i in "${host_list[@]}"
do
echo "$i" slots=$1 >> /job/hostfile
echo "$i" slots=$1 >> /job/hostfile.txt
done
fi

echo Hostfile generated
echo ------------
cat /job/hostfile.txt
cat /job/hostfile
echo ------------

# Create deepspeed call
ds_call="deepspeed --hostfile /job/hostfile "
shift
for i in "$@"
do
ds_call+=$i
ds_call+=" "
done
ls
if [ $RANK == 0 ]
if [[ $RANK == 0 ]] && [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
echo rank is 0, starting deepspeed
sleep 60
12 changes: 5 additions & 7 deletions cli/jobs/deepspeed/deepspeed-training/src/train.py
@@ -56,8 +56,9 @@ def add_argument():

# Need args here to set ranks for multi-node training with download=True
args = add_argument()
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))
if args.with_aml_log:
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))
########################################################################
# The output of torchvision datasets are PILImage images of range [0, 1].
# We transform them to Tensors of normalized range [-1, 1].
@@ -130,10 +131,7 @@ def add_argument():
#
# Showcasing logging metrics to automl.
if args.with_aml_log:
this_run = mlflow.active_run()
if this_run:
print("Active run_id: {}".format(this_run.info.run_id))
mlflow.log_metrics({"hello": 12345})
mlflow.log_metrics({"hello": 12345})
# This is when things start to get interesting.
# We simply have to loop over our data iterator, and feed the inputs to the
# network and optimize.
@@ -161,7 +159,7 @@ def add_argument():
# if i % 2000 == 1999: # print every 2000 mini-batches
loss = running_loss / 2000
print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, loss))
if args.with_aml_log and mlflow.active_run():
if args.with_aml_log:
try:
mlflow.log_metrics({"loss": loss})
except NameError:
