Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev deepspeed on checkpointing #136

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
268 changes: 189 additions & 79 deletions DeepSpeed/Megatron-LM/README.md

Large diffs are not rendered by default.

130 changes: 130 additions & 0 deletions DeepSpeed/Megatron-LM/extract_deepspeed_logs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import os
import re
import sys
import glob
import json
import argparse
import pprint

import numpy as np

# Pretty-printer used at the end to dump the aggregated result dict.
pp = pprint.PrettyPrinter(indent=1)
# Run relative to this script's directory so relative --log_dir/--output_dir
# paths resolve the same way regardless of the caller's cwd.
os.chdir(sys.path[0])

parser = argparse.ArgumentParser(description="flags for benchmark")
# NOTE(review): --log_dir is required=True, so its default value is dead code — confirm intent.
parser.add_argument("--log_dir", type=str, default="./logs/deepspeed/gpt2-small/bz8", required=True)
parser.add_argument("--output_dir", type=str, default="./result", required=False)
# Number of warm-up iterations whose throughput samples are discarded.
parser.add_argument('--warmup_batches', type=int, default=100)
# Upper bound (exclusive slice end) of iterations used for the average.
parser.add_argument('--train_batches', type=int, default=200)
parser.add_argument('--batch_size_per_device', type=int, default=8)

args = parser.parse_args()


class AutoVivification(dict):
    """Nested dict that transparently creates missing levels on access.

    Mirrors Perl's autovivification feature: ``d[a][b][c] = v`` works
    without pre-creating the intermediate dictionaries.
    """

    def __missing__(self, key):
        # dict.__getitem__ calls __missing__ for absent keys; insert a fresh
        # instance of the same class so nesting works to arbitrary depth.
        child = self[key] = type(self)()
        return child


def extract_info_from_file(log_file, result_dict, speed_dict):
    """Parse one benchmark log and record its average throughput.

    Metadata is decoded from the path/filename, the per-step throughput from
    lines containing ``SamplesPerSec=`` in the file body.  Both ``result_dict``
    and ``speed_dict`` are nested (AutoVivification-style) dicts mutated in
    place.

    Expected layout: <log_dir>/<N>n<G>g/<model>_b<batch>_<dtype>_<iter>.log,
    e.g. ".../1n8g/gpt2-small_b8_fp16_3.log".

    Raises:
        ValueError: if the run-case directory name is not of the form
            "<N>n<G>g" with a 1- or 2-digit GPU count.
    """
    fname = os.path.basename(log_file)
    run_case = log_file.split("/")[-2]  # e.g. "1n1g", "2n8g", "1n16g"
    parts = fname.split("_")
    model = parts[0]
    batch_size = int(parts[1].strip("b"))
    precision = parts[2]  # e.g. "fp16"; parsed for clarity, not used further
    # Use split() instead of strip(".log"): str.strip removes a *character
    # set* from both ends, not a suffix, and only worked here by accident
    # (digits are not in the set).
    test_iter = int(parts[3].split(".")[0])
    node_num = int(run_case[0])
    # "1n8g" -> 8 ; "1n16g" -> 16.  The original fell through silently
    # (NameError later) for any other length; fail loudly instead.
    if len(run_case) == 4:
        card_num = int(run_case[-2])
    elif len(run_case) == 5:
        card_num = int(run_case[-3:-1])
    else:
        raise ValueError("unrecognized run case directory: {}".format(run_case))

    # NOTE(review): computed but never used downstream — confirm before removing.
    total_batch_size = node_num * card_num * batch_size

    # Hoisted out of the per-line loop (was recompiled on every match line).
    samples_re = re.compile(r"SamplesPerSec=(.*\.?.*)\n", re.S)

    avg_speed_list = []
    with open(log_file) as f:
        for line in f:
            if "SamplesPerSec" in line:
                item = samples_re.findall(line)
                avg_speed_list.append(round(float(item[0].strip()), 4))

    # Average throughput over iterations [warmup_batches-2, train_batches);
    # the -2 offset matches the logging cadence — TODO confirm.
    begin_index = args.warmup_batches - 2
    avg_speed = round(np.mean(avg_speed_list[begin_index:args.train_batches]), 2)

    result_dict[model][run_case]['average_speed'] = avg_speed
    result_dict[model][run_case]['batch_size_per_device'] = batch_size

    speed_dict[model][run_case][test_iter] = avg_speed

    print(log_file, speed_dict[model][run_case])


def compute_median(iter_dict):
    """Return the median of the per-iteration speeds, rounded to 2 decimals.

    ``iter_dict`` maps a test-iteration number to its average speed.
    """
    speeds = list(iter_dict.values())
    return round(np.median(speeds), 2)


def compute_speedup(result_dict, speed_dict):
    """Fill in average/median speed and the speedup relative to the 1n1g case.

    Mutates ``result_dict`` in place.  When the 1n1g baseline has no recorded
    average speed, each case keeps a fallback speedup of 1.0 and its median
    is left unset.
    """
    for model in list(result_dict):  # e.g. ['gpt2-small', 'gpt2-medium']
        for case in list(result_dict[model]):  # e.g. ['4n8g', '2n8g', '1n1g']
            speedup = 1.0
            # An absent baseline autovivifies to an empty (falsy) dict, so
            # the fallback speedup of 1.0 is kept in that case.
            if result_dict[model]['1n1g']['average_speed']:
                case_speeds = speed_dict[model][case]
                result_dict[model][case]['average_speed'] = compute_average(case_speeds)
                result_dict[model][case]['median_speed'] = compute_median(case_speeds)
                speedup = result_dict[model][case]['median_speed'] / compute_median(speed_dict[model]['1n1g'])
            result_dict[model][case]['speedup'] = round(speedup, 2)


def compute_average(iter_dict):
i = 0
total_speed = 0
for iter in iter_dict:
i += 1
total_speed += iter_dict[iter]
return round(total_speed / i, 2)


def extract_result():
    """Collect every per-case log, compute speedups, and save results as JSON."""
    result_dict = AutoVivification()
    speed_dict = AutoVivification()

    # Logs are laid out as <log_dir>/<run_case>/<name>.log
    for log_path in glob.glob(os.path.join(args.log_dir, "*/*.log")):
        extract_info_from_file(log_path, result_dict, speed_dict)

    # Derive median speeds and speedups relative to the 1n1g baseline.
    compute_speedup(result_dict, speed_dict)

    # Echo the aggregated results to stdout.
    pp.pprint(result_dict)

    # Persist as JSON under the output dir, named after the last component
    # of --log_dir (e.g. ".../bz8" -> "bz8_result.json").
    os.makedirs(args.output_dir, exist_ok=True)
    name_prefix = args.log_dir.split('/')[-1]
    result_file_name = os.path.join(args.output_dir, name_prefix + "_result.json")
    print("Saving result to {}".format(result_file_name))
    with open(result_file_name, 'w') as f:
        json.dump(result_dict, f)

20 changes: 20 additions & 0 deletions DeepSpeed/Megatron-LM/scripts/run_multi_node.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/bash
# Repeat the 4-node x 8-GPU DeepSpeed benchmark TEST_NUM times via runner.sh.
SHELL_FOLDER=$(dirname $(readlink -f "$0"))
MODEL=${1:-gpt2-small}            # network to benchmark, e.g. gpt2-small / gpt2-medium
BATCH_SIZE_PER_DEVICE=${2:-8}
ZERO_STAGE=${3:-2}                # DeepSpeed ZeRO optimization stage
CHECKPOINT_ACTIVATIONS=${4:-"on"} # "on" enables activation checkpointing
DTYPE=${5:-'fp16'}
TEST_NUM=${6:-5}                  # number of repeated runs


i=1
while [ $i -le ${TEST_NUM} ]
do
# runner.sh args: MODEL BATCH NODES GPUS_PER_NODE ZERO_STAGE CKPT_ACT DTYPE TEST_ID
bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 4 8 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i}
echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< "
let i++
# NOTE(review): pkill kills ALL python3 processes on the host, not only the
# benchmark — confirm this is acceptable on shared machines.
pkill python3
sleep 30s
done

42 changes: 42 additions & 0 deletions DeepSpeed/Megatron-LM/scripts/run_single_node.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/bash
# Run the single-node DeepSpeed benchmark sweep: 1, 4, then 8 GPUs,
# each repeated TEST_NUM times via runner.sh.
SHELL_FOLDER=$(dirname $(readlink -f "$0"))
MODEL=${1:-gpt2-small}            # network to benchmark, e.g. gpt2-small / gpt2-medium
BATCH_SIZE_PER_DEVICE=${2:-8}
ZERO_STAGE=${3:-2}                # DeepSpeed ZeRO optimization stage
CHECKPOINT_ACTIVATIONS=${4:-"on"} # "on" enables activation checkpointing
DTYPE=${5:-'fp16'}
TEST_NUM=${6:-5}                  # number of repeated runs per GPU count



# 1 node x 1 GPU
i=1
while [ $i -le ${TEST_NUM} ]
do
bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 1 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i}
echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< "
let i++
# NOTE(review): pkill kills ALL python3 processes on the host — confirm.
pkill python3
sleep 30s
done

# 1 node x 4 GPUs
i=1
while [ $i -le ${TEST_NUM} ]
do
bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 4 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i}
echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< "
let i++
pkill python3
sleep 30s
done

# 1 node x 8 GPUs
i=1
while [ $i -le ${TEST_NUM} ]
do
bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 1 8 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i}
echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< "
let i++
pkill python3
sleep 30s
done


24 changes: 24 additions & 0 deletions DeepSpeed/Megatron-LM/scripts/run_two_node.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/bash
# Repeat the 2-node x 8-GPU DeepSpeed benchmark TEST_NUM times via runner.sh.
SHELL_FOLDER=$(dirname $(readlink -f "$0"))
MODEL=${1:-gpt2-small}            # network to benchmark, e.g. gpt2-small / gpt2-medium
BATCH_SIZE_PER_DEVICE=${2:-8}
ZERO_STAGE=${3:-2}                # DeepSpeed ZeRO optimization stage
CHECKPOINT_ACTIVATIONS=${4:-"on"} # "on" enables activation checkpointing
DTYPE=${5:-'fp16'}
TEST_NUM=${6:-5}                  # number of repeated runs

# export NODE1=10.11.0.2
# export NODE2=10.11.0.3


i=1
while [ $i -le ${TEST_NUM} ]
do
# runner.sh args: MODEL BATCH NODES GPUS_PER_NODE ZERO_STAGE CKPT_ACT DTYPE TEST_ID
bash $SHELL_FOLDER/runner.sh $MODEL $BATCH_SIZE_PER_DEVICE 2 8 $ZERO_STAGE $CHECKPOINT_ACTIVATIONS $DTYPE ${i}
echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< "
let i++
# NOTE(review): pkill kills ALL python3 processes on the host — confirm.
pkill python3
sleep 30s
done


Original file line number Diff line number Diff line change
@@ -1,23 +1,22 @@
#! /bin/bash

# Change for multinode config
BATCH_SIZE=${1:-4}
NUM_GPUS_PER_WORKER=${2:-8}
ZERO_STAGE=${3:-0}
CHECKPOINT_ACTIVATIONS=${4:-"off"}
NUM_WORKERS=${5:-1}
MP_SIZE=${6:-1}
ITER_NUM=${7:-1000}
MODEL=${1:-gpt2-small}
BATCH_SIZE_PER_DEVICE=${2:-8}
NUM_WORKERS=${3:-1}
NUM_GPUS_PER_WORKER=${4:-8}
ZERO_STAGE=${5:-2}
CHECKPOINT_ACTIVATIONS=${6:-"on"}
DTYPE=${7:-'fp16'}
TEST_NUM=${8:-1}
ITER_NUM=${9:-200}
MP_SIZE=${10:-1}

script_path=$(realpath $0)
script_dir=$(dirname $script_path)

echo "BATCH_SIZE: ${BATCH_SIZE}, NUM_GPUS_PER_WORKER:${NUM_GPUS_PER_WORKER}, ZERO_STAGE:${ZERO_STAGE}, CHECKPOINT_ACTIVATIONS:${CHECKPOINT_ACTIVATIONS} "

a=`expr ${#GPUS} + 1`
gpu_num_per_node=`expr ${a} / 2`
gpu_num=`expr ${NUM_GPUS_PER_WORKER} \* ${NUM_WORKERS}`
total_bz=`expr ${BATCH_SIZE} \* ${gpu_num}`
total_bz=`expr ${BATCH_SIZE_PER_DEVICE} \* ${gpu_num}`

sed -i "s/\"train_batch_size\":.*$/\"train_batch_size\": $total_bz,/" $script_dir/ds_zero2_config.json
if [ ${CHECKPOINT_ACTIVATIONS} == "on" ];then
Expand All @@ -27,33 +26,34 @@ else
fi
sed -i "s/\"stage\":.*$/\"stage\": $ZERO_STAGE/" $script_dir/ds_zero2_config.json

# gpt2-small
num_layers=12
num_attention_heads=12
hidden_size=768

# # gpt2-medium
# num_layers=24
# num_attention_heads=16
# hidden_size=1024

if [ ${MODEL} == "gpt2-small" ];then
echo "Using network >> gpt2-small"
num_layers=12
num_attention_heads=12
hidden_size=768
elif [ ${MODEL} == "gpt2-medium" ];then
echo "Using network >> gpt2-medium"
num_layers=24
num_attention_heads=16
hidden_size=1024
fi

PREFIX=20201209-test_zero_gpt2-small
rm -rf checkpoints
LOG_FOLDER=./logs
PREFIX=logs-20210414-stage${ZERO_STAGE}-${CHECKPOINT_ACTIVATIONS}-activation
rm -rf test-checkpoints
LOG_FOLDER=./${PREFIX}/deepspeed/${MODEL}/bz${BATCH_SIZE_PER_DEVICE}/${NUM_WORKERS}n${NUM_GPUS_PER_WORKER}g
mkdir -p $LOG_FOLDER
LOG=${LOG_FOLDER}/${PREFIX}_${NUM_WORKERS}n${NUM_GPUS_PER_WORKER}g_bz${BATCH_SIZE}_zero_stage${ZERO_STAGE}_${CHECKPOINT_ACTIVATIONS}_checkpoint_activation.log

LOG=${LOG_FOLDER}/${MODEL}_b${BATCH_SIZE_PER_DEVICE}_fp16_${TEST_NUM}.log


config_json="$script_dir/ds_zero2_config.json"
gpt_options=" \
--save $PREFIX_checkpoint_${NUM_WORKERS}n${NUM_GPUS_PER_WORKER}g_bz${BATCH_SIZE}_zero_stage${ZERO_STAGE}_${CHECKPOINT_ACTIVATIONS}_checkpoint_activation \
--save test-checkpoints \
--model-parallel-size ${MP_SIZE} \
--num-layers ${num_layers} \
--hidden-size ${hidden_size} \
--num-attention-heads ${num_attention_heads} \
--batch-size ${BATCH_SIZE} \
--batch-size ${BATCH_SIZE_PER_DEVICE} \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters ${ITER_NUM} \
Expand All @@ -69,9 +69,16 @@ gpt_options=" \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--fp16 \
"

if [ "$DTYPE" = "fp16" ] ; then
echo "Using data type >> fp16"
gpt_options="${gpt_options} --fp16 "
else
echo "Using data type >> fp32"
fi


if [ ${CHECKPOINT_ACTIVATIONS} == "on" ];then
gpt_options="${gpt_options}
--checkpoint-activations --deepspeed-activation-checkpointing --deepspeed --deepspeed_config ${config_json} "
Expand All @@ -84,4 +91,6 @@ fi

run_cmd="deepspeed --hostfile=deepspeed_hosts --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} pretrain_gpt2.py ${gpt_options} "
echo ${run_cmd}
eval ${run_cmd} 2>&1 | tee ${LOG}
eval ${run_cmd} 2>&1 | tee ${LOG}