add batchsize and embsize testing scripts #159

Open · wants to merge 4 commits into master

Changes from all commits

10 changes: 10 additions & 0 deletions OneFlow/ClickThroughRate/DLRM/bsz_test.sh
@@ -0,0 +1,10 @@
test_name=bsz_test
emb_size=16

for DEVICE_NUM_PER_NODE in 1 8
do
for BATCHSIZE in 16 64 256 1024 4096 16384 65536
do
bash dlrm_test.sh ${test_name} ${DEVICE_NUM_PER_NODE} ${BATCHSIZE} ${emb_size}
done
done
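
For context, a hypothetical usage sketch (not part of this diff): run the sweep from the DLRM directory; each (device count, batch size) pair produces one .log and one .mem file under log/, named after the test case. emb_size_test.sh below follows the same pattern.

bash bsz_test.sh
ls log/
# bsz_test_n1g1_BATCHSIZE16_embsize16.log, bsz_test_n1g1_BATCHSIZE16_embsize16.mem, ...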
57 changes: 57 additions & 0 deletions OneFlow/ClickThroughRate/DLRM/dlrm_test.sh
@@ -0,0 +1,57 @@
rm -f core.*  # clean up core dumps left over from previous runs

test_name=$1
DEVICE_NUM_PER_NODE=$2
BATCHSIZE=$3
emb_size=$4

MASTER_ADDR=127.0.0.1
NUM_NODES=1
NODE_RANK=0
# DATA_DIR=/dataset/wdl_ofrecord/ofrecord
dataset_format=ofrecord
DATA_DIR=/dataset/f9f659c5/wdl_ofrecord
EMBD_SIZE=33762577 # 33762578

# test: 3274330
# val: 3274328
# train: 39291958
eval_batch_size=327432
eval_batchs=$(( 3274330 / eval_batch_size ))
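# Note: $(( 3274330 / 327432 )) truncates to 10, so evaluation runs 10 full
# batches per pass and the last 10 test examples are never evaluated.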

test_case=${test_name}_n1g${DEVICE_NUM_PER_NODE}_BATCHSIZE${BATCHSIZE}_embsize${emb_size}
log_file=${test_case}.log
mem_file=${test_case}.mem
# export CUDA_VISIBLE_DEVICES=1
export ONEFLOW_DEBUG_MODE=True

mkdir -p log
# Record peak GPU memory in the background while training runs.
python3 gpu_memory_usage.py 1>log/$mem_file 2>&1 </dev/null &

python3 -m oneflow.distributed.launch \
--nproc_per_node $DEVICE_NUM_PER_NODE \
--nnodes $NUM_NODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
train.py \
--interaction_type dot \
--dataset_format $dataset_format \
--embedding_type Embedding \
--bottom_mlp 512,256,$emb_size \
--top_mlp 1024,1024,512,256 \
--embedding_vec_size $emb_size \
--learning_rate 0.1 \
--batch_size $BATCHSIZE \
--data_dir $DATA_DIR \
--loss_print_every_n_iter 100 \
--eval_interval 1000000 \
--eval_batchs $eval_batchs \
--eval_batch_size $eval_batch_size \
--max_iter 10000 \
--vocab_size $EMBD_SIZE \
--data_part_num 256 \
--data_part_name_suffix_length 5 \
--execution_mode 'graph' \
--test_name 'train_graph_consistent_'$DEVICE_NUM_PER_NODE'gpu' | tee log/$log_file
# --model_load_dir /tank/model_zoo/dlrm_baseline_params_emb$emb_size \
# --dataset_format torch \
# --model_load_dir /tank/xiexuan/dlrm/initial_parameters \
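
dlrm_test.sh takes four positional arguments: test name, GPUs per node, batch size, and embedding size. A hypothetical standalone invocation outside the sweep scripts:

bash dlrm_test.sh smoke_test 1 1024 16
# writes log/smoke_test_n1g1_BATCHSIZE1024_embsize16.log plus the matching .mem file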
10 changes: 10 additions & 0 deletions OneFlow/ClickThroughRate/DLRM/emb_size_test.sh
@@ -0,0 +1,10 @@
test_name=embsize_test
BATCHSIZE=32

for DEVICE_NUM_PER_NODE in 1 8
do
for emb_size in 2 8 32
do
bash dlrm_test.sh ${test_name} ${DEVICE_NUM_PER_NODE} ${BATCHSIZE} ${emb_size}
done
done
119 changes: 119 additions & 0 deletions OneFlow/ClickThroughRate/DLRM/extract_info_from_log.py
@@ -0,0 +1,119 @@
import argparse
import os
import glob
from statistics import median


def write_line(f, lst, separator=',', start_end=False):
    lst = ['', *lst, ''] if start_end else lst
    f.write(separator.join(lst))
    f.write('\n')


def value_format(value):
    if isinstance(value, float):
        return '{:.3f}'.format(value)
    elif isinstance(value, int):
        return f'{value:,}'
    else:
        return str(value)


def extract_mem_info(mem_file):
    if not os.path.isfile(mem_file):
        return 'NA'

    with open(mem_file, 'r') as f:
        for line in f.readlines():
            ss = line.split(' ')
            if len(ss) < 5:
                continue
            if ss[0] == 'max':
                return int(float(ss[-1].strip()) / 1024 / 1024)  # bytes -> MB
    return 'NA'


def extract_info_from_file_for_models(log_file):
    '''
    [rank:0] iter: 100/1200, loss: 0.0831875279545784, latency(ms): 81.5818255022168159 | 2021-12-01 13:19:02.625
    [rank:0] iter: 200/1200, loss: 0.0780148208141327, latency(ms): 2.2327776625752449 | 2021-12-01 13:19:02.848
    ...
    [rank:0] iter: 1200/1200, loss: 0.0711858719587326, latency(ms): 2.3108293302357197 | 2021-12-01 13:19:05.145
    '''
    # extract configuration and per-iteration latency from the log contents
    result_dict = {}
    with open(log_file, 'r') as f:
        latencies = []
        for line in f.readlines():
            ss = line.strip().split(' ')
            if ss[0] in ['num_nodes', 'batch_size', 'batch_size_per_proc', 'vocab_size', 'embedding_vec_size']:
                result_dict[ss[0]] = ss[2].strip()
            elif len(ss) > 6 and ss[1] == 'iter:' and ss[3] == 'loss:':
                latencies.append(float(ss[6].strip()))

    result_dict['gpu_num_per_node'] = int(result_dict['batch_size']) // int(result_dict['batch_size_per_proc'])
    result_dict['num_nodes'] = 1

    # drop the first and last iterations, which include warm-up and teardown
    if len(latencies) > 2:
        latencies.pop(0)
        latencies.pop(-1)

    if len(latencies) > 0:
        result_dict['latency(ms)'] = sum(latencies) / len(latencies)
    else:
        result_dict['latency(ms)'] = 'NA'

    mem = extract_mem_info(log_file[:-3] + 'mem')
    result_dict['memory_usage(MB)'] = mem
    return result_dict


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="flags for OneFlow DLRM")
    parser.add_argument("--benchmark_log_dir", type=str, required=True)
    parser.add_argument("--repo", type=str, default='benchmark', help='benchmark or models')
    args = parser.parse_args()

    logs_list = sorted(glob.glob(os.path.join(args.benchmark_log_dir, "*.log")), key=os.path.getmtime)
    # logs_list = sorted(logs_list)
    chunk_list = {}
    for log_file in logs_list:
        if args.repo == 'benchmark':
            # NOTE: extract_info_from_file is not defined in this script; the
            # 'benchmark' path assumes it is provided elsewhere.
            test_result = extract_info_from_file(log_file)
        else:
            test_result = extract_info_from_file_for_models(log_file)

        print(test_result)
        json_file = os.path.basename(log_file)[:-4]
        # json_file = os.path.basename(log_file)[:-13]
        print(json_file)
        test_result['log_file'] = json_file
        if json_file not in chunk_list:
            chunk_list[json_file] = []
        chunk_list[json_file].append(test_result)

    result_list = []
    for log_name, chunk in chunk_list.items():
        latency_list = []
        for single_result in chunk:
            if 'latency(ms)' in single_result:
                latency_list.append(single_result['latency(ms)'])
        tmp_chunk = chunk[0]
        tmp_chunk['gpu'] = 'n{}g{}'.format(tmp_chunk['num_nodes'], tmp_chunk['gpu_num_per_node'])
        if len(latency_list):
            tmp_chunk['latency(ms)'] = median(latency_list)
            result_list.append(tmp_chunk)
        else:
            print('latency is not calculated in ', log_name)

    # with open(os.path.join(args.benchmark_log_dir, 'latency_report.md'), 'w') as f:
    report_file = args.benchmark_log_dir + '_latency_report.md'
    with open(report_file, 'w') as f:
        titles = ['log_file', 'gpu', 'batch_size', 'vocab_size', 'embedding_vec_size', 'latency(ms)', 'memory_usage(MB)']
        write_line(f, titles, '|', True)
        write_line(f, ['----' for _ in titles], '|', True)
        for result in result_list:
            if 'latency(ms)' not in result:
                print(result['log_file'], 'is not complete!')
                continue
            cells = [value_format(result[title]) for title in titles]
            write_line(f, cells, '|', True)
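
For reference, the report written above is a Markdown table whose header and column order are fixed by titles; the data row below is a hypothetical illustration, not a real measurement:

|log_file|gpu|batch_size|vocab_size|embedding_vec_size|latency(ms)|memory_usage(MB)|
|----|----|----|----|----|----|----|
|bsz_test_n1g1_BATCHSIZE16_embsize16|n1g1|16|33762577|16|2.311|1,024|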
1 change: 1 addition & 0 deletions OneFlow/ClickThroughRate/DLRM/extract_info_from_log.sh
@@ -0,0 +1 @@
python3 extract_info_from_log.py --benchmark_log_dir ./log --repo models
25 changes: 25 additions & 0 deletions OneFlow/ClickThroughRate/DLRM/gpu_memory_usage.py
@@ -0,0 +1,25 @@
import time
from pynvml import (
    nvmlInit,
    nvmlShutdown,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
)

nvmlInit()
handle = nvmlDeviceGetHandleByIndex(0)
running = True

# Two-state monitor: wait in 'init' until device 0's used memory rises above
# the threshold (a job has started), then track the peak in 'Detecting' until
# usage drops back below the threshold (the job has finished).
mem_threshold = 32 * 1024 * 1024  # 32 MiB
state = 'init'

device0_max_used_mem = 0
while running:
    time.sleep(1)  # poll once per second
    info = nvmlDeviceGetMemoryInfo(handle)
    if state == 'init':
        if info.used > mem_threshold:
            state = 'Detecting'
    elif state == 'Detecting':
        if info.used < mem_threshold:
            running = False
        else:
            device0_max_used_mem = max(device0_max_used_mem, info.used)

nvmlShutdown()
print('max device0 memory usage is:', device0_max_used_mem)
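
A hypothetical standalone usage sketch (assumption: any job that pushes device 0 above the 32 MiB threshold will trigger detection; dlrm_test.sh already launches this monitor itself):

python3 gpu_memory_usage.py > peak_mem.txt 2>&1 &
# ... run any GPU job on device 0 to completion ...
wait               # the monitor exits once usage falls back below the threshold
cat peak_mem.txt   # max device0 memory usage is: <bytes>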