Fix deepspeed examples bugs (Azure#2044)
* add timeout for deepspeed jobs

* reformat readme with black

* change timeout length

* change dockerfile to use acpt image

* add training custom env

* fix hostfile bug

* fix bash generation

* address comments

* increase number of gpus being used

* make sure deepspeed is upgraded to latest version

* write to hostfile in single process
cassieesvelt authored Feb 6, 2023
1 parent e9c6241 commit 160461b
Showing 8 changed files with 61 additions and 80 deletions.
42 changes: 3 additions & 39 deletions cli/jobs/deepspeed/deepspeed-autotuning/docker-context/Dockerfile
@@ -1,39 +1,3 @@
FROM mcr.microsoft.com/azureml/aifx/stable-ubuntu2004-cu113-py38-torch1110:biweekly.202301.1
RUN pip install git+https://github.com/microsoft/DeepSpeed.git@master

# Install pip dependencies
RUN pip install 'ipykernel~=6.0' \
'azureml-core==1.48.0' \
'azureml-dataset-runtime==1.48.0' \
'azureml-defaults==1.48.0' \
'azure-ml==0.0.1' \
'azure-ml-component==0.9.16.post2' \
'azureml-mlflow==1.48.0' \
'azureml-telemetry==1.48.0' \
'azureml-contrib-services==1.48.0' \
'torch-tb-profiler~=0.4.0' \
'py-spy==0.3.12' \
'debugpy~=1.6.3'

RUN pip install \
azure-ai-ml==1.2.0 \
azureml-inference-server-http~=0.7.0 \
inference-schema~=1.4.2.1 \
MarkupSafe==2.0.1 \
regex \
pybind11

# Inference requirements
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20220607.v1 /artifacts /var/
RUN /var/requirements/install_system_requirements.sh && \
cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
rm -f /etc/nginx/sites-enabled/default
ENV SVDIR=/var/runit
ENV WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# support Deepspeed launcher requirement of passwordless ssh login
RUN apt-get update
RUN apt-get install -y openssh-server openssh-client
FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-py38-cuda11.3-gpu:latest
# Need latest deepspeed version
RUN pip install deepspeed -U
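
The trimmed Dockerfile now just extends the curated ACPT image and upgrades DeepSpeed. A quick local sanity check might look like the following sketch (the image tag is illustrative, not part of the repo):

  # Build the slimmed-down image from the example's docker-context folder.
  docker build -t acpt-deepspeed-autotune cli/jobs/deepspeed/deepspeed-autotuning/docker-context

  # Confirm the upgraded DeepSpeed import works inside the image.
  docker run --rm acpt-deepspeed-autotune python -c "import deepspeed; print(deepspeed.__version__)"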
10 changes: 7 additions & 3 deletions cli/jobs/deepspeed/deepspeed-autotuning/generate-yml.sh
@@ -2,13 +2,15 @@
# Generate key
ssh-keygen -t rsa -f './src/generated-key' -N ''

# Generate yaml file with key path
# Pre-set num_gpus_per_node so it can be passed into deepspeed via bash script.
num_gpus_per_node=8

cat > job.yml << EOF
# Training job submission via AML CLI v2
\$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
command: bash start-deepspeed.sh --autotuning tune --force_multi train.py --with_aml_log=True --deepspeed --deepspeed_config ds_config.json
command: bash start-deepspeed.sh ${num_gpus_per_node} --autotuning tune --force_multi train.py --with_aml_log=True --deepspeed --deepspeed_config ds_config.json
experiment_name: DistributedJob-DeepsSpeed-Autotuning-cifar
display_name: deepspeed-autotuning-example
@@ -20,6 +22,8 @@ environment_variables:
AZUREML_COMPUTE_USE_COMMON_RUNTIME: 'True'
AZUREML_COMMON_RUNTIME_USE_INTERACTIVE_CAPABILITY: 'True'
AZUREML_SSH_KEY: 'generated-key'
limits:
timeout: 1800
outputs:
output:
type: uri_folder
@@ -28,7 +32,7 @@ outputs:
compute: azureml:gpu-v100-cluster
distribution:
type: pytorch
process_count_per_instance: 1
process_count_per_instance: ${num_gpus_per_node}
resources:
instance_count: 2
EOF
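With num_gpus_per_node baked into the generated YAML and the new limits block capping runtime, regenerating and submitting the job follows the usual CLI v2 flow. A minimal sketch, assuming a configured Azure ML workspace (placeholder names are mine):

  # Regenerate job.yml with the SSH key path and the per-node GPU count filled in.
  bash generate-yml.sh

  # Submit the command job with the Azure ML CLI v2 extension.
  az ml job create --file job.yml --resource-group <my-resource-group> --workspace-name <my-workspace>
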
24 changes: 14 additions & 10 deletions cli/jobs/deepspeed/deepspeed-autotuning/src/start-deepspeed.sh
@@ -1,6 +1,5 @@
#!/bin/bash
az_batch_host_list="$AZ_BATCH_HOST_LIST"
local_gpu_count=$((AZ_BATCHAI_GPU_COUNT / AZUREML_NODE_COUNT))
RANK="$AZUREML_CR_NODE_RANK"

# Start ssh
@@ -19,31 +18,36 @@ touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
/usr/sbin/sshd -D -p 1143 &

# Create hostfile
# Create hostfile. Use num_gpus_per_node to populate slots value.
oldIFS=IFS
IFS=',' read -ra host_list <<< "$az_batch_host_list"
sudo mkdir /job
for i in "${host_list[@]}"
do
echo "$i" slots="$local_gpu_count" >> /job/hostfile
echo "$i" slots="$local_gpu_count" >> /job/hostfile.txt
done
IFS=$oldIFS

sudo mkdir /job
if [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
for i in "${host_list[@]}"
do
echo "$i" slots=$1 >> /job/hostfile
echo "$i" slots=$1 >> /job/hostfile.txt
done
fi

echo Hostfile generated
echo ------------
cat /job/hostfile.txt
cat /job/hostfile
echo ------------

# Create deepspeed call
ds_call="deepspeed --hostfile /job/hostfile "
shift
for i in "$@"
do
ds_call+=$i
ds_call+=" "
done
ls
if [ $RANK == 0 ]
if [[ $RANK == 0 ]] && [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
echo rank is 0, starting deepspeed
sleep 60
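The reworked script takes the slot count as its first argument (hence the shift before assembling the remaining deepspeed arguments) and writes the hostfile from a single process. A minimal sketch of the effect, with illustrative host names:

  # Only the process named rank_0 writes the hostfile, instead of every launched
  # process (now 8 per node), which avoids duplicate entries.
  if [[ "$AZUREML_PROCESS_NAME" == "rank_0" ]]
  then
      for host in 10.0.0.4 10.0.0.5
      do
          echo "$host" slots=8 >> /job/hostfile
      done
  fi

  # Resulting /job/hostfile read by "deepspeed --hostfile /job/hostfile ...":
  # 10.0.0.4 slots=8
  # 10.0.0.5 slots=8
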
12 changes: 5 additions & 7 deletions cli/jobs/deepspeed/deepspeed-autotuning/src/train.py
@@ -56,8 +56,9 @@ def add_argument():

# Need args here to set ranks for multi-node training with download=True
args = add_argument()
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))
if args.with_aml_log:
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))
########################################################################
# The output of torchvision datasets are PILImage images of range [0, 1].
# We transform them to Tensors of normalized range [-1, 1].
@@ -130,10 +131,7 @@ def add_argument():
#
# Showcasing logging metrics to automl.
if args.with_aml_log:
this_run = mlflow.active_run()
if this_run:
print("Active run_id: {}".format(this_run.info.run_id))
mlflow.log_metrics({"hello": 12345})
mlflow.log_metrics({"hello": 12345})
# This is when things start to get interesting.
# We simply have to loop over our data iterator, and feed the inputs to the
# network and optimize.
@@ -161,7 +159,7 @@ def add_argument():
# if i % 2000 == 1999: # print every 2000 mini-batches
loss = running_loss / 2000
print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, loss))
if args.with_aml_log and mlflow.active_run():
if args.with_aml_log:
try:
mlflow.log_metrics({"loss": loss})
except NameError:
3 changes: 3 additions & 0 deletions cli/jobs/deepspeed/deepspeed-training/docker-context/Dockerfile
@@ -0,0 +1,3 @@
FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-py38-cuda11.3-gpu:latest
# Need latest deepspeed version
RUN pip install deepspeed -U
14 changes: 10 additions & 4 deletions cli/jobs/deepspeed/deepspeed-training/generate-yml.sh
@@ -2,22 +2,28 @@
# Generate key
ssh-keygen -t rsa -f './src/generated-key' -N ''

# Generate yaml file with key path
# Pre-set num_gpus_per_node so it can be passed into deepspeed via bash script.
num_gpus_per_node=8

cat > job.yml << EOF
# Training job submission via AML CLI v2
\$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
command: bash start-deepspeed.sh --force_multi train.py --with_aml_log=True --deepspeed --deepspeed_config ds_config.json
command: bash start-deepspeed.sh ${num_gpus_per_node} --force_multi train.py --with_aml_log=True --deepspeed --deepspeed_config ds_config.json
experiment_name: DistributedJob-DeepsSpeed-Training-cifar
display_name: deepspeed-training-example
code: src
environment: azureml:AzureML-ACPT-pytorch-1.11-py38-cuda11.3-gpu@latest
environment:
build:
path: docker-context
environment_variables:
AZUREML_COMPUTE_USE_COMMON_RUNTIME: 'True'
AZUREML_COMMON_RUNTIME_USE_INTERACTIVE_CAPABILITY: 'True'
AZUREML_SSH_KEY: 'generated-key'
limits:
timeout: 900
outputs:
output:
type: uri_folder
@@ -26,7 +32,7 @@ outputs:
compute: azureml:gpu-v100-cluster
distribution:
type: pytorch
process_count_per_instance: 1
process_count_per_instance: ${num_gpus_per_node}
resources:
instance_count: 2
EOF
24 changes: 14 additions & 10 deletions cli/jobs/deepspeed/deepspeed-training/src/start-deepspeed.sh
@@ -1,6 +1,5 @@
#!/bin/bash
az_batch_host_list="$AZ_BATCH_HOST_LIST"
local_gpu_count=$((AZ_BATCHAI_GPU_COUNT / AZUREML_NODE_COUNT))
RANK="$AZUREML_CR_NODE_RANK"

# Start ssh
@@ -19,31 +18,36 @@ touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
/usr/sbin/sshd -D -p 1143 &

# Create hostfile
# Create hostfile. Use num_gpus_per_node to populate the slots variable.
oldIFS=IFS
IFS=',' read -ra host_list <<< "$az_batch_host_list"
sudo mkdir /job
for i in "${host_list[@]}"
do
echo "$i" slots="$local_gpu_count" >> /job/hostfile
echo "$i" slots="$local_gpu_count" >> /job/hostfile.txt
done
IFS=$oldIFS

sudo mkdir /job
if [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
for i in "${host_list[@]}"
do
echo "$i" slots=$1 >> /job/hostfile
echo "$i" slots=$1 >> /job/hostfile.txt
done
fi

echo Hostfile generated
echo ------------
cat /job/hostfile.txt
cat /job/hostfile
echo ------------

# Create deepspeed call
ds_call="deepspeed --hostfile /job/hostfile "
shift
for i in "$@"
do
ds_call+=$i
ds_call+=" "
done
ls
if [ $RANK == 0 ]
if [[ $RANK == 0 ]] && [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
echo rank is 0, starting deepspeed
sleep 60
12 changes: 5 additions & 7 deletions cli/jobs/deepspeed/deepspeed-training/src/train.py
@@ -56,8 +56,9 @@ def add_argument():

# Need args here to set ranks for multi-node training with download=True
args = add_argument()
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))
if args.with_aml_log:
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))
########################################################################
# The output of torchvision datasets are PILImage images of range [0, 1].
# We transform them to Tensors of normalized range [-1, 1].
@@ -130,10 +131,7 @@ def add_argument():
#
# Showcasing logging metrics to automl.
if args.with_aml_log:
this_run = mlflow.active_run()
if this_run:
print("Active run_id: {}".format(this_run.info.run_id))
mlflow.log_metrics({"hello": 12345})
mlflow.log_metrics({"hello": 12345})
# This is when things start to get interesting.
# We simply have to loop over our data iterator, and feed the inputs to the
# network and optimize.
@@ -161,7 +159,7 @@ def add_argument():
# if i % 2000 == 1999: # print every 2000 mini-batches
loss = running_loss / 2000
print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, loss))
if args.with_aml_log and mlflow.active_run():
if args.with_aml_log:
try:
mlflow.log_metrics({"loss": loss})
except NameError:
