diff --git a/OneFlow/ClickThroughRate/DLRM/bsz_test.sh b/OneFlow/ClickThroughRate/DLRM/bsz_test.sh
new file mode 100644
index 00000000..139ce7ab
--- /dev/null
+++ b/OneFlow/ClickThroughRate/DLRM/bsz_test.sh
@@ -0,0 +1,10 @@
+test_name=bsz_test
+emb_size=16
+
+for DEVICE_NUM_PER_NODE in 1 8
+do
+    for BATHSIZE in 16 64 256 1024 4096 16384 65536
+    do
+        bash dlrm_test.sh ${test_name} ${DEVICE_NUM_PER_NODE} ${BATHSIZE} ${emb_size}
+    done
+done
diff --git a/OneFlow/ClickThroughRate/DLRM/dlrm_test.sh b/OneFlow/ClickThroughRate/DLRM/dlrm_test.sh
new file mode 100644
index 00000000..d6e50a23
--- /dev/null
+++ b/OneFlow/ClickThroughRate/DLRM/dlrm_test.sh
@@ -0,0 +1,57 @@
+rm core.*
+
+test_name=$1
+DEVICE_NUM_PER_NODE=$2
+BATHSIZE=$3
+emb_size=$4
+
+MASTER_ADDR=127.0.0.1
+NUM_NODES=1
+NODE_RANK=0
+# DATA_DIR=/dataset/wdl_ofrecord/ofrecord
+dataset_format=ofrecord
+DATA_DIR=/dataset/f9f659c5/wdl_ofrecord
+EMBD_SIZE=33762577 # 33762578
+
+# test: 3274330
+# val: 3274328
+# train: 39291958
+eval_batch_size=327432
+eval_batchs=$(( 3274330 / eval_batch_size ))
+
+test_case=${test_name}_n1g${DEVICE_NUM_PER_NODE}_BATHSIZE${BATHSIZE}_embsize${emb_size}
+log_file=${test_case}.log
+mem_file=${test_case}.mem
+# export CUDA_VISIBLE_DEVICES=1
+export ONEFLOW_DEBUG_MODE=True
+
+python3 gpu_memory_usage.py 1>log/$mem_file 2>&1

[... truncated: the rest of dlrm_test.sh (including the training launch command) and the top of extract_info_from_log.py are missing; the diff resumes partway through a log-parsing function in extract_info_from_log.py ...]

+            if len(ss) > 6 and ss[1] == 'iter:' and ss[3] == 'loss:':
+                latencies.append(float(ss[6].strip()))
+
+    result_dict['gpu_num_per_node'] = int(int(result_dict['batch_size']) / int(result_dict['batch_size_per_proc']))
+    result_dict['num_nodes'] = 1
+
+    if len(latencies) > 2:
+        latencies.pop(0)
+        latencies.pop(-1)
+
+    if len(latencies) > 0:
+        result_dict['latency(ms)'] = sum(latencies) / len(latencies)
+    else:
+        result_dict['latency(ms)'] = 'NA'
+
+    mem = extract_mem_info(log_file[:-3] + 'mem')
+    result_dict['memory_usage(MB)'] = mem
+    return result_dict
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="flags for OneFlow wide&deep")
+    parser.add_argument("--benchmark_log_dir", type=str, required=True)
+    parser.add_argument("--repo", type=str, default='benchmark', help='benchmark or models')
+    args = parser.parse_args()
+
+    logs_list = sorted(glob.glob(os.path.join(args.benchmark_log_dir, "*.log")), key=os.path.getmtime)
+    #logs_list = sorted(logs_list)
+    chunk_list = {}
+    for log_file in logs_list:
+        if args.repo == 'benchmark':
+            test_result = extract_info_from_file(log_file)
+        else:
+            test_result = extract_info_from_file_for_models(log_file)
+
+        print(test_result)
+        json_file = os.path.basename(log_file)[:-4]
+        # json_file = os.path.basename(log_file)[:-13]
+        print(json_file)
+        test_result['log_file'] = json_file
+        if json_file not in chunk_list.keys():
+            chunk_list[json_file] = []
+        chunk_list[json_file].append(test_result)
+    result_list = []
+    for log_name, chunk in chunk_list.items():
+        latency_list = []
+        for single_result in chunk:
+            if 'latency(ms)' in single_result:
+                latency_list.append(single_result['latency(ms)'])
+        tmp_chunk = chunk[0]
+        tmp_chunk['gpu'] = 'n{}g{}'.format(tmp_chunk['num_nodes'], tmp_chunk['gpu_num_per_node'])
+        if len(latency_list):
+            tmp_chunk['latency(ms)'] = median(latency_list)
+            result_list.append(tmp_chunk)
+        else:
+            print('latency is not calculated in ', log_name)
+    #with open(os.path.join(args.benchmark_log_dir, 'latency_reprot.md'), 'w') as f:
+    report_file = args.benchmark_log_dir + '_latency_report.md'
+    with open(report_file, 'w') as f:
+        titles = ['log_file', 'gpu', 'batch_size', 'vocab_size', 'embedding_vec_size', 'latency(ms)',
+                  'memory_usage(MB)']
+        write_line(f, titles, '|', True)
+        write_line(f, ['----' for _ in titles], '|', True)
+        for result in result_list:
+            if 'latency(ms)' not in result.keys():
+                print(result['log_file'], 'is not complete!')
+                continue
+            cells = [value_format(result[title]) for title in titles]
+            write_line(f, cells, '|', True)
diff --git a/OneFlow/ClickThroughRate/DLRM/extract_info_from_log.sh b/OneFlow/ClickThroughRate/DLRM/extract_info_from_log.sh
new file mode 100644
index 00000000..09a2ae18
--- /dev/null
+++ b/OneFlow/ClickThroughRate/DLRM/extract_info_from_log.sh
@@ -0,0 +1 @@
+python extract_info_from_log.py --benchmark_log_dir ./log --repo models
diff --git a/OneFlow/ClickThroughRate/DLRM/gpu_memory_usage.py b/OneFlow/ClickThroughRate/DLRM/gpu_memory_usage.py
new file mode 100644
index 00000000..3b036ebe
--- /dev/null
+++ b/OneFlow/ClickThroughRate/DLRM/gpu_memory_usage.py
@@ -0,0 +1,25 @@
+import time
+from pynvml import *
+
+nvmlInit()
+handle = nvmlDeviceGetHandleByIndex(0)
+running = True
+
+mem_threshold = 32*1024*1024
+state = 'init' #'Detecting'
+
+device0_max_used_mem = 0
+while running == True:
+    time.sleep(1)
+    info = nvmlDeviceGetMemoryInfo(handle)
+    if state == 'init':
+        if info.used > mem_threshold:
+            state = 'Detecting'
+    elif state == 'Detecting':
+        if info.used < mem_threshold:
+            running = False
+        else:
+            device0_max_used_mem = max(device0_max_used_mem, info.used)
+
+nvmlShutdown()
+print('max device0 memory usage is:', device0_max_used_mem)
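
The definitions of value_format, write_line, and extract_mem_info (together with the imports at the top of extract_info_from_log.py) fall inside the truncated part of the diff. The sketch below is inferred only from their call sites above: the names come from the diff, but every signature, body, and unit conversion here is an assumption, not the committed code.

import os

def value_format(value):
    # Assumption: render floats with fixed precision for the table, pass everything else through as str.
    if isinstance(value, float):
        return '{:.1f}'.format(value)
    return str(value)

def write_line(f, cells, separator='|', bordered=True):
    # Assumption: join the cells into one markdown table row; when bordered, wrap the row in leading/trailing separators.
    row = separator.join(str(c) for c in cells)
    if bordered:
        row = separator + row + separator
    f.write(row + '\n')

def extract_mem_info(mem_file):
    # Assumption: gpu_memory_usage.py prints the peak usage in bytes as the last token of its line; convert that to MB.
    if not os.path.isfile(mem_file):
        return 'NA'
    with open(mem_file, 'r') as f:
        for line in f:
            ss = line.split()
            if ss and ss[-1].isdigit():
                return round(float(ss[-1]) / 1024 / 1024)
    return 'NA'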
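
For reference, the ss[1] == 'iter:' / ss[3] == 'loss:' checks and the ss[6] read in the surviving parsing loop imply a training-log line shaped roughly as below, assuming the line is split on single spaces; the exact field layout is not stated in the diff and is an assumption.

# Hypothetical log line; only the positions of 'iter:', 'loss:' and the latency field
# are implied by the parser above.
line = '[rank:0] iter: 100/1100, loss: 0.4731, latency(ms): 11.237'
ss = line.split(' ')
assert len(ss) > 6 and ss[1] == 'iter:' and ss[3] == 'loss:'
print(float(ss[6].strip()))  # 11.237 would be appended to latencies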