From 608a5ed7542b404dda373ab1b0c2c4937e9806af Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 9 Jan 2024 11:05:29 +0100 Subject: [PATCH 01/40] Added bfloat16 run --- cnn.sh | 15 +++++++++------ mlp.sh | 29 +++++++++++++++-------------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/cnn.sh b/cnn.sh index 60ad577..83d5974 100755 --- a/cnn.sh +++ b/cnn.sh @@ -10,11 +10,14 @@ if [[ -z "${DL_BENCH_ARGS}" ]]; then fi CNNS=(vgg16 resnet18 resnet50 resnext50 resnext101 densenet121 mobilenet_v3l) -for BS in 0001 0032 0128 +for DTYPE in float32 bfloat16 do - for name in "${CNNS[@]}" - do - echo "Benchmark $name" - benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --benchmark_desc "${name}_bs$BS" ${DL_BENCH_ARGS} || echo Failed - done + for BS in 0001 0032 0128 + do + for name in "${CNNS[@]}" + do + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" ${DL_BENCH_ARGS} || echo Failed + done + done done diff --git a/mlp.sh b/mlp.sh index e055d93..6db7dfa 100755 --- a/mlp.sh +++ b/mlp.sh @@ -9,21 +9,22 @@ if [[ -z "${DL_BENCH_ARGS}" ]]; then exit 1 fi -echo "Bfloat16 on size5" -benchmark-run -b mlp -p "name='size5',batch_size=1024" --benchmark_desc "size5_bs1024_bfloat16" --dtype bfloat16 ${DL_BENCH_ARGS} || echo Failed - # for size in size5_bn_gelu -for size in size2 size3 size4 size5 size5_sigm size5_tanh size5_gelu size5_linear size5_inplace size5_bn size5_bn_gelu size5_drop_gelu 100@512 25@1024 4@16384 2@16384 +for DTYPE in float32 bfloat16 do - echo "Benchmark $size" - benchmark-run -b mlp -p "name='${size}'" --benchmark_desc "${size}_bs1024" ${DL_BENCH_ARGS} || echo Failed -done + echo "DTYPE=${DTYPE}" + for size in size2 size3 size4 size5 size5_sigm size5_tanh size5_gelu size5_linear size5_inplace size5_bn size5_bn_gelu size5_drop_gelu 100@512 25@1024 4@16384 2@16384 + do + echo "Benchmark $size" + benchmark-run -b mlp -p "name='${size}'" --dtype "${DTYPE}" --benchmark_desc "${size}_bs1024" ${DL_BENCH_ARGS} || echo Failed + done -size="size5" -for BATCH_SIZE in 1 16 256 2048 8196 -do - echo "Batch size $BATCH_SIZE" - echo "Benchmark $size" - BATCH_SIZE_TXT=$(printf "%04d" $BATCH_SIZE) - benchmark-run -b mlp -p "name='${size}',batch_size=${BATCH_SIZE}" --benchmark_desc "${size}_bs${BATCH_SIZE_TXT}" ${DL_BENCH_ARGS} || echo Failed + size="size5" + for BATCH_SIZE in 1 16 256 2048 8196 + do + echo "Batch size $BATCH_SIZE" + echo "Benchmark $size" + BATCH_SIZE_TXT=$(printf "%04d" $BATCH_SIZE) + benchmark-run -b mlp -p "name='${size}',batch_size=${BATCH_SIZE}" --dtype "${DTYPE}" --benchmark_desc "${size}_bs${BATCH_SIZE_TXT}" ${DL_BENCH_ARGS} || echo Failed + done done From d5f9e187da0499787adb1c20b937846c53d1624b Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 9 Jan 2024 11:09:22 +0100 Subject: [PATCH 02/40] llm update --- llm.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llm.sh b/llm.sh index 94fe3ae..13681e1 100755 --- a/llm.sh +++ b/llm.sh @@ -9,5 +9,7 @@ if [[ -z "${DL_BENCH_ARGS}" ]]; then exit 1 fi -benchmark-run -b llm -p "" --benchmark_desc "gptj" --dtype float32 ${DL_BENCH_ARGS} || echo Failed -benchmark-run -b llm -p "" --benchmark_desc "gptj_bfloat16" --dtype bfloat16 ${DL_BENCH_ARGS} || echo Failed +for DTYPE in float32 bfloat16 +do + benchmark-run -b llm -p "" --benchmark_desc "gptj" --dtype "${DTYPE}" ${DL_BENCH_ARGS} || echo Failed +done From 8c397b107121e1b26a6a85a73291d121b1b9fb10 Mon Sep 17 00:00:00 2001 From: 
Egor Krivov Date: Tue, 9 Jan 2024 13:54:34 +0100 Subject: [PATCH 03/40] fixed --- dl_bench/utils.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index af2d41d..4561573 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -375,9 +375,14 @@ def inference(self, backend: Backend): self.compile(sample, backend) print("Warmup started") + enabled = not (backend.dtype == torch.float32) with torch.no_grad(): self.net.eval() - with tm.timeit("warmup_s"): + with tm.timeit("warmup_s"), torch.autocast( + enabled=enabled, + device_type=backend.device_name, + dtype=backend.dtype, + ): sample = backend.to_device(sample) self.net(sample) self.net(sample) @@ -396,13 +401,11 @@ def inference(self, backend: Backend): for i, x in enumerate(test_loader): s = get_time() x = backend.to_device(x) - if backend.dtype != torch.float32: - with torch.autocast( - device_type=backend.device_name, - dtype=backend.dtype, - ): - y = self.net(x) - else: + with torch.autocast( + enabled=enabled, + device_type=backend.device_name, + dtype=backend.dtype, + ): y = self.net(x) fw_times.append(get_time() - s) From 0ebd42f95e57eae2ce6314a15506c3fe7619f87c Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 9 Jan 2024 15:56:30 +0100 Subject: [PATCH 04/40] ipex activated --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8db6c53..84eb363 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -38,7 +38,7 @@ jobs: {device: 'cpu', compiler: 'torchscript'}, {device: 'cpu', compiler: 'torchscript_onednn'}, {device: 'cpu', compiler: 'ipex'}, - # {device: 'cpu', compiler: 'ipex_onednn_graph'}, + {device: 'cpu', compiler: 'ipex_onednn_graph'}, # {device: 'xpu', compiler: 'ipex'}, {device: 'cpu', compiler: 'torch_mlir'} ] From d26ad5de18b4ced17effb449a553f5f105fd4819 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 9 Jan 2024 16:47:53 +0100 Subject: [PATCH 05/40] reverted cm --- dl_bench/utils.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 4561573..0e38307 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -375,15 +375,18 @@ def inference(self, backend: Backend): self.compile(sample, backend) print("Warmup started") - enabled = not (backend.dtype == torch.float32) - with torch.no_grad(): + with torch.no_grad(), tm.timeit("warmup_s"): self.net.eval() - with tm.timeit("warmup_s"), torch.autocast( - enabled=enabled, - device_type=backend.device_name, - dtype=backend.dtype, - ): - sample = backend.to_device(sample) + sample = backend.to_device(sample) + if backend.dtype != torch.float32: + with torch.autocast( + device_type=backend.device_name, + dtype=backend.dtype, + ): + self.net(sample) + self.net(sample) + self.net(sample) + else: self.net(sample) self.net(sample) self.net(sample) @@ -401,11 +404,13 @@ def inference(self, backend: Backend): for i, x in enumerate(test_loader): s = get_time() x = backend.to_device(x) - with torch.autocast( - enabled=enabled, - device_type=backend.device_name, - dtype=backend.dtype, - ): + if backend.dtype != torch.float32: + with torch.autocast( + device_type=backend.device_name, + dtype=backend.dtype, + ): + y = self.net(x) + else: y = self.net(x) fw_times.append(get_time() - s) From 5c058a9dc44a1549d203d1cc55fae8eba26e7a94 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 6 Feb 2024 
17:02:52 +0100 Subject: [PATCH 06/40] first full exp --- scripts/margin.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 scripts/margin.sh diff --git a/scripts/margin.sh b/scripts/margin.sh new file mode 100644 index 0000000..9bb452e --- /dev/null +++ b/scripts/margin.sh @@ -0,0 +1,28 @@ +#!/bin/sh + +set -x + +HOST="test" + +export KMP_AFFINITY="respect,noreset,granularity=fine,balanced" +export OMP_NUM_THREADS=32 +export ONEDNN_VERBOSE=0 + +if [[ -z "${DL_BENCH_ARGS}" ]]; then + echo "Please, provide DL_BENCH_ARGS environment variable" + exit 1 +fi + +CNNS=(resnet50) +for COMPILER in dynamo ipex_onednn_graph +for DTYPE in float32 bfloat16 +do + for BS in 0001 0032 0128 + do + for name in "${CNNS[@]}" + do + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" + done + done +done From 89cf5dabd5179bcb4eb58025353c5c4077da2f8b Mon Sep 17 00:00:00 2001 From: Gregory Shimansky Date: Fri, 19 Jan 2024 10:41:13 -0600 Subject: [PATCH 07/40] Added xsmm backend for torch_mlir compiler (#70) Signed-off-by: Gregory Shimansky --- .github/workflows/test-single-config.yml | 1 + .github/workflows/test.yml | 12 ++---------- dl_bench/cli/launcher.py | 1 + dl_bench/utils.py | 7 +++++-- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test-single-config.yml b/.github/workflows/test-single-config.yml index da7c9d2..ea2d1de 100644 --- a/.github/workflows/test-single-config.yml +++ b/.github/workflows/test-single-config.yml @@ -21,6 +21,7 @@ on: - torch - dynamo - torch_mlir + - torch_mlir_xsmm - torchscript - torchscript_onednn - ipex diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 84eb363..7a5f307 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,7 +40,8 @@ jobs: {device: 'cpu', compiler: 'ipex'}, {device: 'cpu', compiler: 'ipex_onednn_graph'}, # {device: 'xpu', compiler: 'ipex'}, - {device: 'cpu', compiler: 'torch_mlir'} + {device: 'cpu', compiler: 'torch_mlir'}, + {device: 'cpu', compiler: 'torch_mlir_xsmm'} ] test_script: ${{ fromJson(inputs.test_scripts) }} fail-fast: false @@ -56,12 +57,3 @@ jobs: test_script: ${{ matrix.test_script }} secrets: DB_URL: ${{ secrets.DB_URL }} - - shutdown: - needs: mlp_test - if: ${{ contains(inputs.runner_type, 'amd') }} && inputs.shutdown_cloud_runner - runs-on: ${{ inputs.runner_type }} - steps: - - name: shutdown - shell: bash -el {0} - run: sudo shutdown -h +2 diff --git a/dl_bench/cli/launcher.py b/dl_bench/cli/launcher.py index eabcc3b..6115496 100644 --- a/dl_bench/cli/launcher.py +++ b/dl_bench/cli/launcher.py @@ -85,6 +85,7 @@ def parse_args(): "ipex", "ipex_onednn_graph", "torch_mlir", + "torch_mlir_xsmm", ], help="Compilation mode to use. 
No compilation by default.", ) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 0e38307..b0c9d8b 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -268,7 +268,7 @@ def _compile_model(compile_mode: str, device, model: Module, sample_input, dtype compiled_model = dynamo.optimize(be.refbackend_torchdynamo_backend)(model) print("Compiled with torch_mlir (torchscript, inference)") - elif compile_mode == "torch_mlir": + elif compile_mode == "torch_mlir" or compile_mode == "torch_mlir_xsmm": from torch_mlir._dynamo_fx_importer import import_fx_graph_as_func from torch_mlir_e2e_test.configs.torchdynamo import jit from torch_mlir_e2e_test.framework import TestOptions @@ -277,6 +277,9 @@ def _compile_model(compile_mode: str, device, model: Module, sample_input, dtype from torch_mlir_e2e_test.linalg_on_tensors_backends.cpuprotobackend import ( CpuProtoLinalgOnTensorsBackend, ) + from torch_mlir_e2e_test.linalg_on_tensors_backends.xsmmprotobackend import ( + XsmmProtoLinalgOnTensorsBackend, + ) import torch.utils._pytree as pytree # debug_timer seems to cause problems: @@ -290,7 +293,7 @@ def _compile_model(compile_mode: str, device, model: Module, sample_input, dtype opts, output_type="linalg-on-tensors", ) - backend = CpuProtoLinalgOnTensorsBackend(opts) + backend = CpuProtoLinalgOnTensorsBackend(opts) if compile_mode == "torch_mlir" else XsmmProtoLinalgOnTensorsBackend(opts) # backend = RefBackendLinalgOnTensorsBackend() module = backend.compile(module) backend_module = backend.load(module) From 4f7d5c969a84c1571c85bfe1ab564f6247b26794 Mon Sep 17 00:00:00 2001 From: Gregory Shimansky Date: Mon, 22 Jan 2024 10:36:13 -0600 Subject: [PATCH 08/40] Added weekly scheduled runs for all compilers (#72) Signed-off-by: Gregory Shimansky --- .github/workflows/test.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7a5f307..e542863 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,6 +21,9 @@ on: required: false default: '["./mlp.sh", "./cnn.sh", "./llm.sh"]' type: string + schedule: + # Runs at 12pm UTC (6am CST) on every Saturday + - cron: "0 12 * * 6" jobs: print_inputs: @@ -43,15 +46,15 @@ jobs: {device: 'cpu', compiler: 'torch_mlir'}, {device: 'cpu', compiler: 'torch_mlir_xsmm'} ] - test_script: ${{ fromJson(inputs.test_scripts) }} + test_script: ${{ github.event_name == 'workflow_dispatch' && fromJson(inputs.test_scripts) || fromJson('["./mlp.sh", "./cnn.sh", "./llm.sh"]') }} fail-fast: false uses: ./.github/workflows/execute-test-script.yml with: compiler: ${{ matrix.type.compiler }} device: ${{ matrix.type.device }} - tag: ${{ inputs.tag }} - torch_mlir_repo: ${{ inputs.torch_mlir_repo }} - torch_mlir_branch: ${{ inputs.torch_mlir_branch }} + tag: ${{ github.event_name == 'workflow_dispatch' && inputs.tag || 'ci' }} + torch_mlir_repo: ${{ github.event_name == 'workflow_dispatch' && inputs.torch_mlir_repo || 'intel-ai/torch-mlir' }} + torch_mlir_branch: ${{ github.event_name == 'workflow_dispatch' && inputs.torch_mlir_branch || 'cpu-proto' }} runner_type: spr shutdown_cloud_runner: false test_script: ${{ matrix.test_script }} From 1e7731965238e1ae5e6d90d4117305ae21d77722 Mon Sep 17 00:00:00 2001 From: Egor Date: Thu, 8 Feb 2024 11:44:54 +0100 Subject: [PATCH 09/40] Set python==3.11 (#76) --- tests/conda-envs/cpu.yaml | 1 + tests/conda-envs/cuda.yaml | 1 + tests/conda-envs/ipex.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/tests/conda-envs/cpu.yaml 
b/tests/conda-envs/cpu.yaml index b769970..3d70190 100644 --- a/tests/conda-envs/cpu.yaml +++ b/tests/conda-envs/cpu.yaml @@ -2,6 +2,7 @@ name: cpu channels: - pytorch dependencies: + - python==3.11 - pytorch - torchvision - torchaudio diff --git a/tests/conda-envs/cuda.yaml b/tests/conda-envs/cuda.yaml index 06c0afa..f83cfb4 100644 --- a/tests/conda-envs/cuda.yaml +++ b/tests/conda-envs/cuda.yaml @@ -3,6 +3,7 @@ channels: - pytorch - nvidia dependencies: + - python==3.11 - pytorch - torchvision - torchaudio diff --git a/tests/conda-envs/ipex.yaml b/tests/conda-envs/ipex.yaml index d136704..d6d5b2d 100644 --- a/tests/conda-envs/ipex.yaml +++ b/tests/conda-envs/ipex.yaml @@ -3,6 +3,7 @@ channels: - intel - conda-forge dependencies: + - python==3.11 - intel-aikit-pytorch - pytorch>=2.0.1=*_xpu_* - intel-extension-for-pytorch From 700a97cc38af4163ec27658f6eec6cb7af42c283 Mon Sep 17 00:00:00 2001 From: Ivy Zhang Date: Thu, 8 Feb 2024 21:11:35 +0800 Subject: [PATCH 10/40] skip 3 warmup steps in benchmarking (#75) --- dl_bench/mlp.py | 5 ++++- dl_bench/utils.py | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dl_bench/mlp.py b/dl_bench/mlp.py index 576198e..c05ddc2 100644 --- a/dl_bench/mlp.py +++ b/dl_bench/mlp.py @@ -87,7 +87,10 @@ def __init__(self, params) -> None: name = params.get("name", "size5") net = get_mlp(n_chans_in=IN_FEAT, n_chans_out=N_CLASSES, name=name) + min_batches = int(params.get("min_batches", 10)) + min_seconds = int(params.get("min_seconds", 10)) super().__init__( - net=net, in_shape=in_shape, dataset=dataset, batch_size=batch_size + net=net, in_shape=in_shape, dataset=dataset, batch_size=batch_size,\ + min_batches=min_batches, min_seconds=min_seconds ) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index b0c9d8b..af9bba7 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -121,6 +121,8 @@ def str_to_dtype(dtype: str): return torch.float32 elif dtype == "bfloat16": return torch.bfloat16 + elif dtype == "int8": + return torch.qint8 else: raise ValueError(f"Unsupported data type: {dtype}") @@ -415,7 +417,7 @@ def inference(self, backend: Backend): y = self.net(x) else: y = self.net(x) - + if i < 3: continue fw_times.append(get_time() - s) n_items += len(x) outputs.append(y) From 8398a93939c9f58cd6231ee9fcf60e4bd3c2befe Mon Sep 17 00:00:00 2001 From: Egor Date: Thu, 8 Feb 2024 14:23:09 +0100 Subject: [PATCH 11/40] Disabled background batch processing (#77) --- dl_bench/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index af9bba7..28776e5 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -34,10 +34,10 @@ def get_inf_loaders(n, in_shape, batch_size, device: str): ds = RandomInfDataset(n, in_shape) train_loader = DataLoader( - ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=pin_memory + ds, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=pin_memory ) test_loader = DataLoader( - ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=pin_memory + ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=pin_memory ) return train_loader, test_loader @@ -363,7 +363,7 @@ def inference(self, backend: Backend): self.dataset, batch_size=self.batch_size, shuffle=False, - num_workers=4, + num_workers=0, pin_memory=backend.device_name == "cuda", ) From 1b06c2d4cc9092373e5f04c8a612f84eb1ca74f5 Mon Sep 17 00:00:00 2001 From: Egor Date: Thu, 8 Feb 2024 17:28:39 +0100 Subject: [PATCH 12/40] Removed python311 from ipex (#79) 
--- tests/conda-envs/ipex.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/conda-envs/ipex.yaml b/tests/conda-envs/ipex.yaml index d6d5b2d..d136704 100644 --- a/tests/conda-envs/ipex.yaml +++ b/tests/conda-envs/ipex.yaml @@ -3,7 +3,6 @@ channels: - intel - conda-forge dependencies: - - python==3.11 - intel-aikit-pytorch - pytorch>=2.0.1=*_xpu_* - intel-extension-for-pytorch From 8029719f785e842d824ef4b1b02943cce504a87e Mon Sep 17 00:00:00 2001 From: Egor Date: Thu, 8 Feb 2024 17:29:26 +0100 Subject: [PATCH 13/40] Refactored warmup, increased dataset size for MLP (#78) --- dl_bench/mlp.py | 6 +++--- dl_bench/utils.py | 30 +++++++++--------------------- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/dl_bench/mlp.py b/dl_bench/mlp.py index c05ddc2..93845a8 100644 --- a/dl_bench/mlp.py +++ b/dl_bench/mlp.py @@ -81,8 +81,8 @@ def __init__(self, params) -> None: batch_size = int(params.get("batch_size", 1024)) - min_batches = 10 - DATASET_SIZE = max(10_240, batch_size * min_batches) + min_batches = 20 + DATASET_SIZE = max(102_400, batch_size * min_batches) dataset = RandomInfDataset(DATASET_SIZE, in_shape) name = params.get("name", "size5") @@ -92,5 +92,5 @@ def __init__(self, params) -> None: super().__init__( net=net, in_shape=in_shape, dataset=dataset, batch_size=batch_size,\ - min_batches=min_batches, min_seconds=min_seconds + min_batches=min_batches, min_seconds=min_seconds, warmup_batches=10 ) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 28776e5..9300fa2 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -343,12 +343,13 @@ def _get_device(device_name): class Benchmark: def __init__( - self, net, in_shape, dataset, batch_size, min_batches=10, min_seconds=10 + self, net, in_shape, dataset, batch_size, min_batches=10, min_seconds=10, warmup_batches=3, ) -> None: self.net = net self.in_shape = in_shape self.dataset = dataset self.batch_size = batch_size + self.warmup_batches = warmup_batches self.min_batches = min_batches self.min_seconds = min_seconds @@ -379,24 +380,6 @@ def inference(self, backend: Backend): sample = next(iter(test_loader)) self.compile(sample, backend) - print("Warmup started") - with torch.no_grad(), tm.timeit("warmup_s"): - self.net.eval() - sample = backend.to_device(sample) - if backend.dtype != torch.float32: - with torch.autocast( - device_type=backend.device_name, - dtype=backend.dtype, - ): - self.net(sample) - self.net(sample) - self.net(sample) - else: - self.net(sample) - self.net(sample) - self.net(sample) - print("Warmup done") - n_items = 0 self.net.eval() @@ -417,7 +400,11 @@ def inference(self, backend: Backend): y = self.net(x) else: y = self.net(x) - if i < 3: continue + + if i < self.warmup_batches: + start = time.perf_counter() + continue + fw_times.append(get_time() - s) n_items += len(x) outputs.append(y) @@ -425,7 +412,7 @@ def inference(self, backend: Backend): # early stopping if we have 10+ batches and were running for 10+ seconds if ( (time.perf_counter() - start) > self.min_seconds - and n_items > self.batch_size * self.min_batches + and n_items >= self.batch_size * self.min_batches ): break @@ -437,6 +424,7 @@ def inference(self, backend: Backend): ) results = tm.get_results() + results["duration_s"] = get_time() - start results["samples_per_s"] = n_items / sum(fw_times) results["flops_per_sample"] = self.flops_per_sample From 3cd8a8fc96262196acfa21158c496d46b4044c7f Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 12 Feb 2024 10:38:47 +0000 Subject: [PATCH 14/40] update --- dl_bench/llm.py | 
7 ++++--- dl_bench/utils.py | 4 +++- scripts/margin.sh | 27 ++++++++++++--------------- 3 files changed, 19 insertions(+), 19 deletions(-) mode change 100644 => 100755 scripts/margin.sh diff --git a/dl_bench/llm.py b/dl_bench/llm.py index 1f01256..d4001cf 100644 --- a/dl_bench/llm.py +++ b/dl_bench/llm.py @@ -1,6 +1,7 @@ import time import torch +import intel_extension_for_pytorch as ipex from transformers import AutoModelForCausalLM, AutoTokenizer from dl_bench.utils import TimerManager, Benchmark, str_to_dtype @@ -12,7 +13,7 @@ def get_llm(name, dtype): model_name = "EleutherAI/gpt-j-6B" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype, torchscript=True) tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") return tokenizer, model @@ -53,11 +54,11 @@ def inference(self, backend): print("Warmup started") with torch.inference_mode(), tm.timeit("warmup_s"): - self.model.eval() + # self.model.eval() self.generate(self.warmup_prompt) print("Warmup done") - self.model.eval() + # self.model.eval() enabled = backend.dtype != torch.float32 with torch.inference_mode(), torch.autocast( enabled=enabled, device_type=backend.device_name diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 9300fa2..57a77a0 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -184,7 +184,9 @@ def _compile_transformer_model(compile_mode, model, dtype=torch.bfloat16): import intel_extension_for_pytorch as ipex params = {} if dtype != torch.bfloat16 else {"dtype": torch.bfloat16} - compiled_model = ipex.optimize_transformers(model, **params) + #compiled_model = ipex.llm.optimize(model, **params, inplace=True, deployment_mode=True) + compiled_model = ipex.llm.optimize(model, **params) + # compiled_model = ipex.optimize_transformers(model, **params) print("Compiled with ipex") elif compile_mode == "ipex_onednn_graph": raise NotImplementedError() diff --git a/scripts/margin.sh b/scripts/margin.sh old mode 100644 new mode 100755 index 9bb452e..4fa8905 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash set -x @@ -8,21 +8,18 @@ export KMP_AFFINITY="respect,noreset,granularity=fine,balanced" export OMP_NUM_THREADS=32 export ONEDNN_VERBOSE=0 -if [[ -z "${DL_BENCH_ARGS}" ]]; then - echo "Please, provide DL_BENCH_ARGS environment variable" - exit 1 -fi - CNNS=(resnet50) for COMPILER in dynamo ipex_onednn_graph -for DTYPE in float32 bfloat16 do - for BS in 0001 0032 0128 - do - for name in "${CNNS[@]}" - do - echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" - done - done + for DTYPE in float32 bfloat16 + do + for BS in 0001 0032 0128 + do + for name in "${CNNS[@]}" + do + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" + done + done + done done From eee627bc036eee00e3d590967cec027a94a6d366 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 12 Feb 2024 16:56:23 +0000 Subject: [PATCH 15/40] update --- dl_bench/cli/launcher.py | 1 + dl_bench/utils.py | 2 ++ scripts/margin.sh | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dl_bench/cli/launcher.py b/dl_bench/cli/launcher.py index 6115496..92aa295 100644 --- a/dl_bench/cli/launcher.py 
+++ b/dl_bench/cli/launcher.py @@ -194,6 +194,7 @@ def main(): / (10**12) ) ) + print("FPS: {:.1f}".format(results.get("samples_per_s", 0))) pprint.pprint(report) if args.output is not None: diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 57a77a0..1f43e84 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -145,6 +145,7 @@ def __init__(self, device, compiler, dtype="float32") -> None: self.dtype = str_to_dtype(dtype) def to_device(self, x: torch.Tensor): + x = x.contiguous(memory_format=torch.channels_last) if self.device_name in ("cuda", "xpu"): return x.to(self.device) elif self.device_name == "cpu": @@ -230,6 +231,7 @@ def _compile_model(compile_mode: str, device, model: Module, sample_input, dtype # enable oneDNN graph fusion globally torch.jit.enable_onednn_fusion(True) compiled_model = torch.jit.trace(model, sample_input) + compiled_model = torch.jit.freeze(compiled_model) print("Compiled with torchscript onednn") elif compile_mode == "ipex": diff --git a/scripts/margin.sh b/scripts/margin.sh index 4fa8905..abbeced 100755 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -18,7 +18,7 @@ do for name in "${CNNS[@]}" do echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --verbose --skip_verification done done done From 95a20101a0d5d463cbec9fe3bcee125cdd76c443 Mon Sep 17 00:00:00 2001 From: Egor Date: Mon, 12 Feb 2024 18:00:47 +0100 Subject: [PATCH 16/40] Added sync for nvidia backend (#84) --- dl_bench/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 1f43e84..f28d186 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -153,6 +153,10 @@ def to_device(self, x: torch.Tensor): else: raise ValueError("Unknown device") + def sync(self): + if self.device_name == 'cuda': + torch.cuda.synchronize() + def prepare_eval_transformer(self, model): model = model.to(memory_format=torch.channels_last) @@ -394,6 +398,7 @@ def inference(self, backend: Backend): # Duration is inconsistent now with tm.timeit("duration_s"): for i, x in enumerate(test_loader): + backend.sync() s = get_time() x = backend.to_device(x) if backend.dtype != torch.float32: @@ -409,6 +414,7 @@ def inference(self, backend: Backend): start = time.perf_counter() continue + backend.sync() fw_times.append(get_time() - s) n_items += len(x) outputs.append(y) From 7c1ef81ab80c6fdeabbbf802eacb2fd831eec71f Mon Sep 17 00:00:00 2001 From: Egor Date: Tue, 13 Feb 2024 15:19:20 +0100 Subject: [PATCH 17/40] Added sql code for clean view (#86) --- db_tools/create_view.sql | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 db_tools/create_view.sql diff --git a/db_tools/create_view.sql b/db_tools/create_view.sql new file mode 100644 index 0000000..bea9518 --- /dev/null +++ b/db_tools/create_view.sql @@ -0,0 +1,16 @@ +CREATE OR REPLACE VIEW torchmlir_benchmark_view AS +SELECT + id, + REPLACE(REPLACE(CONCAT(host, '-', compiler, '-', dtype, '-', tag), 'torchscript', 'ts'), '-ci', '') AS backend, + host, + device, + compiler, + dtype, + tag, + benchmark, + benchmark_desc, + samples_per_s AS items_per_s, + flops_per_sample, + flops_per_sample * samples_per_s / 1e12 AS tflops, + date +FROM torchmlir_benchmark; From 
09f178e13bfa45a33d5775c2bbcd1473cd469b79 Mon Sep 17 00:00:00 2001 From: Egor Date: Tue, 13 Feb 2024 18:00:18 +0100 Subject: [PATCH 18/40] Added more measurement info like p50, p90 (#87) --- .github/workflows/execute-test-script.yml | 2 +- .github/workflows/test-single-config.yml | 1 - dl_bench/cli/launcher.py | 23 ++++---- dl_bench/utils.py | 69 +++++++++++++---------- 4 files changed, 49 insertions(+), 46 deletions(-) diff --git a/.github/workflows/execute-test-script.yml b/.github/workflows/execute-test-script.yml index 20ac472..5d1f23c 100644 --- a/.github/workflows/execute-test-script.yml +++ b/.github/workflows/execute-test-script.yml @@ -116,7 +116,7 @@ jobs: URL="--url ${{ secrets.DB_URL }}" fi - export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} -v ${URL}" + export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} ${URL}" # We mainly want to verify our own backend if [[ ${{ inputs.compiler }} != *torch_mlir* ]]; then diff --git a/.github/workflows/test-single-config.yml b/.github/workflows/test-single-config.yml index ea2d1de..3e8b323 100644 --- a/.github/workflows/test-single-config.yml +++ b/.github/workflows/test-single-config.yml @@ -78,7 +78,6 @@ jobs: torch_mlir_repo: ${{ inputs.torch_mlir_repo }} torch_mlir_branch: ${{ inputs.torch_mlir_branch }} runner_type: ${{ inputs.runner_type }} - shutdown_cloud_runner: ${{ inputs.shutdown_cloud_runner }} test_script: ${{ matrix.test_script }} secrets: DB_URL: ${{ secrets.DB_URL }} diff --git a/dl_bench/cli/launcher.py b/dl_bench/cli/launcher.py index 92aa295..fcf9025 100644 --- a/dl_bench/cli/launcher.py +++ b/dl_bench/cli/launcher.py @@ -110,9 +110,6 @@ def parse_args(): parser.add_argument( "-o", "--output", required=False, help="Path to output report file." ) - parser.add_argument( - "-v", "--verbose", required=False, action="store_true", help="Verbose mode." 
- ) parser.add_argument( "--skip_verification", required=False, @@ -185,17 +182,17 @@ def main(): db = BenchmarkDb(args.url) - if args.verbose: - print("Report:") - print( - "TFLOPS: {:.3}".format( - results.get("flops_per_sample", 0) - * results.get("samples_per_s", 0) - / (10**12) - ) + print("Report:") + print("FPS: {:.1f}".format(results.get("samples_per_s", 0))) + print( + "TFLOPS: {:.3}".format( + results.get("flops_per_sample", 0) + * results.get("samples_per_s", 0) + / (10**12) ) - print("FPS: {:.1f}".format(results.get("samples_per_s", 0))) - pprint.pprint(report) + ) + pprint.pprint(report) + pprint.pprint(results) if args.output is not None: with open(args.output, "w", encoding="UTF-8") as out: diff --git a/dl_bench/utils.py b/dl_bench/utils.py index f28d186..46b2386 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -389,54 +389,61 @@ def inference(self, backend: Backend): self.compile(sample, backend) n_items = 0 - - self.net.eval() outputs = [] fw_times = [] + + self.net.eval() with torch.no_grad(): start = time.perf_counter() - # Duration is inconsistent now - with tm.timeit("duration_s"): - for i, x in enumerate(test_loader): - backend.sync() - s = get_time() - x = backend.to_device(x) - if backend.dtype != torch.float32: - with torch.autocast( - device_type=backend.device_name, - dtype=backend.dtype, - ): - y = self.net(x) - else: + for i, x in enumerate(test_loader): + backend.sync() + s = get_time() + x = backend.to_device(x) + if backend.dtype != torch.float32: + with torch.autocast( + device_type=backend.device_name, + dtype=backend.dtype, + ): y = self.net(x) + else: + y = self.net(x) - if i < self.warmup_batches: - start = time.perf_counter() - continue + backend.sync() - backend.sync() - fw_times.append(get_time() - s) - n_items += len(x) - outputs.append(y) + if i < self.warmup_batches: + # We restart timer because that was just a warmup + start = time.perf_counter() + continue - # early stopping if we have 10+ batches and were running for 10+ seconds - if ( - (time.perf_counter() - start) > self.min_seconds - and n_items >= self.batch_size * self.min_batches - ): - break + fw_times.append(get_time() - s) + n_items += len(x) + outputs.append(y) + + # early stopping if we have 10+ batches and were running for 10+ seconds + if ( + (time.perf_counter() - start) > self.min_seconds + and n_items >= self.batch_size * self.min_batches + ): + break + + if (get_time() - start) > max_time: + break - if (get_time() - start) > max_time: - break + stop = get_time() print( f"Latency 0%-5%-50%-95%-100% are: {np.percentile(fw_times, [0, 5, 50, 95, 100])}" ) results = tm.get_results() - results["duration_s"] = get_time() - start + results["duration_s"] = stop - start results["samples_per_s"] = n_items / sum(fw_times) + results["dirty_items_per_s"] = n_items / results["duration_s"] results["flops_per_sample"] = self.flops_per_sample + results["n_items"] = n_items + results["p50"] = np.percentile(fw_times, 50) + results["p90"] = np.percentile(fw_times, 90) + results["p100"] = max(fw_times) return results, outputs From 0f01f6bc74a8a223525cb1f31261888d83caeef5 Mon Sep 17 00:00:00 2001 From: Egor Date: Tue, 13 Feb 2024 18:24:58 +0100 Subject: [PATCH 19/40] Fixed llm code for nvidia (#85) --- dl_bench/llm.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dl_bench/llm.py b/dl_bench/llm.py index d4001cf..5cf426f 100644 --- a/dl_bench/llm.py +++ b/dl_bench/llm.py @@ -32,12 +32,15 @@ def __init__(self, params) -> None: "num_beams": 4, } - def 
generate(self, prompt):
+    def generate(self, prompt, backend):
         input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+        backend.sync()
         start = time.perf_counter()
+        input_ids = backend.to_device(input_ids)
         gen_tokens = self.model.generate(
             input_ids, **self.gen_kwargs, pad_token_id=self.tokenizer.eos_token_id
         )
+        backend.sync()
         total_time = time.perf_counter() - start
 
         # text = self.tokenizer.batch_decode(gen_tokens)[0]
@@ -54,8 +57,8 @@
 
         print("Warmup started")
         with torch.inference_mode(), tm.timeit("warmup_s"):
-            # self.model.eval()
-            self.generate(self.warmup_prompt)
+            self.model.eval()
+            self.generate(self.warmup_prompt, backend)
         print("Warmup done")
 
         # self.model.eval()
         enabled = backend.dtype != torch.float32
         with torch.inference_mode(), torch.autocast(
             enabled=enabled, device_type=backend.device_name
         ), tm.timeit("duration_s"):
-            tokens, total_time = self.generate(self.prompt)
+            tokens, total_time = self.generate(self.prompt, backend)
             outputs = [tokens]
 
         results = tm.get_results()

From 0710c6efdf58c049c5e77154629a84eebd8c3c13 Mon Sep 17 00:00:00 2001
From: Egor
Date: Mon, 12 Feb 2024 18:00:47 +0100
Subject: [PATCH 20/40] Added sync for nvidia backend (#84)

---
 .vscode/settings.json                  | 3 +++
 cpu-dynamo-cnnsh-results.db/results.db | Bin 0 -> 8192 bytes
 cpu-dynamo-mlpsh-results.db/results.db | Bin 0 -> 8192 bytes
 dl_bench/results.db                    | Bin 0 -> 8192 bytes
 scripts/margin_setup.sh                | 5 +++++
 5 files changed, 8 insertions(+)
 create mode 100644 .vscode/settings.json
 create mode 100644 cpu-dynamo-cnnsh-results.db/results.db
 create mode 100644 cpu-dynamo-mlpsh-results.db/results.db
 create mode 100644 dl_bench/results.db
 create mode 100644 scripts/margin_setup.sh

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..457f44d
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.analysis.typeCheckingMode": "basic"
+}
\ No newline at end of file
diff --git a/cpu-dynamo-cnnsh-results.db/results.db b/cpu-dynamo-cnnsh-results.db/results.db
new file mode 100644
index 0000000000000000000000000000000000000000..a06c830f45de28b2aa8f5770fbc1961cc0a4f939
GIT binary patch
literal 8192
[8192 bytes of base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/cpu-dynamo-mlpsh-results.db/results.db b/cpu-dynamo-mlpsh-results.db/results.db
new file mode 100644
GIT binary patch
literal 8192
[8192 bytes of base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/dl_bench/results.db b/dl_bench/results.db
new file mode 100644
index 0000000000000000000000000000000000000000..064d103a8d029f1d275998ec039bdd255aaf11d3
GIT binary patch
literal 8192
[8192 bytes of base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/scripts/margin_setup.sh b/scripts/margin_setup.sh
new file mode 100644
index 0000000..b3411b9
--- /dev/null
+++ b/scripts/margin_setup.sh
@@ -0,0 +1,5 @@
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+sh -y ./Miniconda3-latest-Linux-x86_64.sh
+
+# get github repo
+conda install gh -c conda-forge --solver libmamba
\ No newline at end of file

From 2056ace335b6594a98e18b78ab0d86eb3b6da404 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Thu, 15 Feb 2024 13:25:28 +0100
Subject: [PATCH 21/40] cleaned

---
 .vscode/settings.json                  | 3 ---
 cpu-dynamo-cnnsh-results.db/results.db | Bin 8192 -> 0 bytes
 cpu-dynamo-mlpsh-results.db/results.db | Bin 8192 -> 0 bytes
 3 files changed, 3 deletions(-)
 delete mode 100644 .vscode/settings.json
 delete mode 100644 cpu-dynamo-cnnsh-results.db/results.db
 delete mode 100644 cpu-dynamo-mlpsh-results.db/results.db

diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index 457f44d..0000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "python.analysis.typeCheckingMode": "basic"
-}
\ No newline at end of file
diff --git a/cpu-dynamo-cnnsh-results.db/results.db b/cpu-dynamo-cnnsh-results.db/results.db
deleted file mode 100644
index a06c830f45de28b2aa8f5770fbc1961cc0a4f939..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8192
[8192 bytes of base85-encoded binary data omitted]

diff --git a/cpu-dynamo-mlpsh-results.db/results.db b/cpu-dynamo-mlpsh-results.db/results.db
deleted file mode 100644
GIT binary patch
literal 0
HcmV?d00001

literal 8192
[8192 bytes of base85-encoded binary data omitted]

From 6d5e68a391e07308e8a6ad1efbfe2fbc40dc9537 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Thu, 15 Feb 2024 13:44:00 +0100
Subject: [PATCH 22/40] Merged

---
 dl_bench/mlp.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dl_bench/mlp.py b/dl_bench/mlp.py
index 96680a2..9715197 100644
--- a/dl_bench/mlp.py
+++ b/dl_bench/mlp.py
@@ -89,8 +89,6 @@ def __init__(self, params) -> None:
 
         name = params.get("name", "size5")
         net = get_mlp(n_chans_in=IN_FEAT, n_chans_out=N_CLASSES, name=name)
-        min_batches = int(params.get("min_batches", 10))
-        min_seconds = int(params.get("min_seconds", 10))
 
         super().__init__(
             net=net,

From 2d287416dd4a99a2d84e8004c1d4abdac387b73e Mon Sep 17 00:00:00 2001
From: Egor
Date: Mon, 12 Feb 2024 18:00:47 +0100
Subject: [PATCH 23/40] Added sync for nvidia backend (#84)

---
 dl_bench/utils.py | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/dl_bench/utils.py 
b/dl_bench/utils.py index 46b2386..c9e6769 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -395,14 +395,34 @@ def inference(self, backend: Backend): self.net.eval() with torch.no_grad(): start = time.perf_counter() - for i, x in enumerate(test_loader): - backend.sync() - s = get_time() - x = backend.to_device(x) - if backend.dtype != torch.float32: - with torch.autocast( - device_type=backend.device_name, - dtype=backend.dtype, + # Duration is inconsistent now + with tm.timeit("duration_s"): + for i, x in enumerate(test_loader): + backend.sync() + s = get_time() + x = backend.to_device(x) + if backend.dtype != torch.float32: + with torch.autocast( + device_type=backend.device_name, + dtype=backend.dtype, + ): + y = self.net(x) + else: + y = self.net(x) + + if i < self.warmup_batches: + start = time.perf_counter() + continue + + backend.sync() + fw_times.append(get_time() - s) + n_items += len(x) + outputs.append(y) + + # early stopping if we have 10+ batches and were running for 10+ seconds + if ( + (time.perf_counter() - start) > self.min_seconds + and n_items >= self.batch_size * self.min_batches ): y = self.net(x) else: From a295084e8c70be334b6ee671cd0f13d96e73ab18 Mon Sep 17 00:00:00 2001 From: Egor Date: Tue, 13 Feb 2024 18:00:18 +0100 Subject: [PATCH 24/40] Added more measurement info like p50, p90 (#87) --- dl_bench/utils.py | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index c9e6769..46b2386 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -395,34 +395,14 @@ def inference(self, backend: Backend): self.net.eval() with torch.no_grad(): start = time.perf_counter() - # Duration is inconsistent now - with tm.timeit("duration_s"): - for i, x in enumerate(test_loader): - backend.sync() - s = get_time() - x = backend.to_device(x) - if backend.dtype != torch.float32: - with torch.autocast( - device_type=backend.device_name, - dtype=backend.dtype, - ): - y = self.net(x) - else: - y = self.net(x) - - if i < self.warmup_batches: - start = time.perf_counter() - continue - - backend.sync() - fw_times.append(get_time() - s) - n_items += len(x) - outputs.append(y) - - # early stopping if we have 10+ batches and were running for 10+ seconds - if ( - (time.perf_counter() - start) > self.min_seconds - and n_items >= self.batch_size * self.min_batches + for i, x in enumerate(test_loader): + backend.sync() + s = get_time() + x = backend.to_device(x) + if backend.dtype != torch.float32: + with torch.autocast( + device_type=backend.device_name, + dtype=backend.dtype, ): y = self.net(x) else: From a349d0be43a9b5aed7cfa68e17794464cf394fd3 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 13:48:07 +0100 Subject: [PATCH 25/40] cleanup --- dl_bench/llm.py | 4 ++-- dl_bench/utils.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/dl_bench/llm.py b/dl_bench/llm.py index 5cf426f..71b53e5 100644 --- a/dl_bench/llm.py +++ b/dl_bench/llm.py @@ -13,7 +13,7 @@ def get_llm(name, dtype): model_name = "EleutherAI/gpt-j-6B" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype, torchscript=True) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype) tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") return tokenizer, model @@ -61,7 +61,7 @@ def inference(self, backend): self.generate(self.warmup_prompt, backend) print("Warmup done") - # self.model.eval() + self.model.eval() enabled = backend.dtype != 
torch.float32 with torch.inference_mode(), torch.autocast( enabled=enabled, device_type=backend.device_name diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 46b2386..875ea99 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -145,7 +145,6 @@ def __init__(self, device, compiler, dtype="float32") -> None: self.dtype = str_to_dtype(dtype) def to_device(self, x: torch.Tensor): - x = x.contiguous(memory_format=torch.channels_last) if self.device_name in ("cuda", "xpu"): return x.to(self.device) elif self.device_name == "cpu": @@ -235,7 +234,6 @@ def _compile_model(compile_mode: str, device, model: Module, sample_input, dtype # enable oneDNN graph fusion globally torch.jit.enable_onednn_fusion(True) compiled_model = torch.jit.trace(model, sample_input) - compiled_model = torch.jit.freeze(compiled_model) print("Compiled with torchscript onednn") elif compile_mode == "ipex": From d3e3b3c7ed6838904df0c91232ab87adc87430aa Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 14:05:42 +0100 Subject: [PATCH 26/40] updated margin --- scripts/margin.sh | 3 ++- scripts/margin_setup.sh | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/margin.sh b/scripts/margin.sh index abbeced..fa1c0d9 100755 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -5,7 +5,8 @@ set -x HOST="test" export KMP_AFFINITY="respect,noreset,granularity=fine,balanced" -export OMP_NUM_THREADS=32 +export OMP_NUM_THREADS=$(grep ^cpu\\scores /proc/cpuinfo | uniq | awk '{print $4}') +echo "Cores configured $OMP_NUM_THREADS" export ONEDNN_VERBOSE=0 CNNS=(resnet50) diff --git a/scripts/margin_setup.sh b/scripts/margin_setup.sh index b3411b9..c21662b 100644 --- a/scripts/margin_setup.sh +++ b/scripts/margin_setup.sh @@ -1,5 +1,18 @@ +# install miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh sh -y ./Miniconda3-latest-Linux-x86_64.sh # get github repo -conda install gh -c conda-forge --solver libmamba \ No newline at end of file +conda install gh -c conda-forge --solver libmamba + +# set up env +conda create -y -n ipex python=3.11 +conda activate ipex +# Install ipex & pytorch +python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +python -m pip install intel-extension-for-pytorch +python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + +# Install benchmarks +pip install -e . 
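
A quick standalone sketch of the core-count detection that margin.sh now uses above. This is an illustration only, not part of the patch series; the nproc fallback is an assumption added here for robustness:

#!/bin/bash
# "cpu cores" in /proc/cpuinfo reports physical cores per socket; the 4th
# whitespace-separated field is the count, e.g. "cpu cores : 32".
cores=$(grep '^cpu cores' /proc/cpuinfo | uniq | awk '{print $4}')
# Fall back to the logical CPU count if the field is absent (e.g. some VMs).
cores="${cores:-$(nproc)}"
export OMP_NUM_THREADS="$cores"
echo "Cores configured $OMP_NUM_THREADS"
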
+ From e6355d57b6ec7ce81c9d12bf67a321337e56e7e6 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 13:08:29 +0000 Subject: [PATCH 27/40] update --- scripts/margin.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scripts/margin.sh b/scripts/margin.sh index fa1c0d9..248495c 100755 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -19,8 +19,23 @@ do for name in "${CNNS[@]}" do echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --verbose --skip_verification + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification done done done done + + +LLMS=(gptj) +for COMPILER in dynamo ipex +do + for DTYPE in float32 bfloat16 + do + for name in "${LMMS[@]}" + do + echo "Benchmark $name with DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification + done + done +done + From 7c28f274046e19185fd14c665ff3878d32d7ce74 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 17:49:45 +0100 Subject: [PATCH 28/40] Update --- scripts/margin.sh | 10 +++++----- scripts/margin_setup.sh | 10 +++++++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/scripts/margin.sh b/scripts/margin.sh index 248495c..932c7a6 100755 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -18,8 +18,8 @@ do do for name in "${CNNS[@]}" do - echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification | true done done done @@ -31,10 +31,10 @@ for COMPILER in dynamo ipex do for DTYPE in float32 bfloat16 do - for name in "${LMMS[@]}" + for name in "${LLMS[@]}" do - echo "Benchmark $name with DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification + echo "Benchmark $name with DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification done done done diff --git a/scripts/margin_setup.sh b/scripts/margin_setup.sh index c21662b..9ca50b2 100644 --- a/scripts/margin_setup.sh +++ b/scripts/margin_setup.sh @@ -1,3 +1,8 @@ +#!/bin/bash + +# We expect to have this repo present and this script run as +# ./scripts/margin.sh + # install miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh sh -y ./Miniconda3-latest-Linux-x86_64.sh @@ -6,8 +11,8 @@ sh -y ./Miniconda3-latest-Linux-x86_64.sh conda install gh -c conda-forge --solver libmamba # set up env -conda create -y -n ipex python=3.11 -conda activate ipex +conda create -y -n margin python=3.11 +conda activate margin # Install ipex & pytorch python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu python -m pip install 
intel-extension-for-pytorch @@ -15,4 +20,3 @@ python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension # Install benchmarks pip install -e . - From d085e41c12c06f3d6803f8e5dc19ce8f9e2cad16 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 18:42:46 +0100 Subject: [PATCH 29/40] update setup --- scripts/margin_setup.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/margin_setup.sh b/scripts/margin_setup.sh index 9ca50b2..0fb7948 100644 --- a/scripts/margin_setup.sh +++ b/scripts/margin_setup.sh @@ -5,10 +5,10 @@ # install miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -sh -y ./Miniconda3-latest-Linux-x86_64.sh - -# get github repo -conda install gh -c conda-forge --solver libmamba +curl -o Miniconda3-latest-Linux-x86_64.sh -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + sh Miniconda3-latest-Linux-x86_64.sh -u -b -p ./miniconda && \ + rm -f Miniconda3-latest-Linux-x86_64.sh +source ./miniconda/bin/activate # set up env conda create -y -n margin python=3.11 From 6ad377ea76700e5bbc409a6dd7810246f43aa1a5 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 18:43:30 +0100 Subject: [PATCH 30/40] added iteration --- scripts/margin.sh | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/scripts/margin.sh b/scripts/margin.sh index 932c7a6..347e862 100755 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -9,33 +9,35 @@ export OMP_NUM_THREADS=$(grep ^cpu\\scores /proc/cpuinfo | uniq | awk '{print $ echo "Cores configured $OMP_NUM_THREADS" export ONEDNN_VERBOSE=0 -CNNS=(resnet50) -for COMPILER in dynamo ipex_onednn_graph +for i in 1 2 3 4 5 6 7 do - for DTYPE in float32 bfloat16 + CNNS=(resnet50) + for COMPILER in dynamo ipex_onednn_graph do - for BS in 0001 0032 0128 - do - for name in "${CNNS[@]}" - do - echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification | true - done - done + for DTYPE in float32 bfloat16 + do + for BS in 0001 0032 0128 + do + for name in "${CNNS[@]}" + do + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification | true + done + done + done done -done -LLMS=(gptj) -for COMPILER in dynamo ipex -do - for DTYPE in float32 bfloat16 + LLMS=(gptj) + for COMPILER in dynamo ipex do - for name in "${LLMS[@]}" + for DTYPE in float32 bfloat16 do - echo "Benchmark $name with DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification + for name in "${LLMS[@]}" + do + echo "Benchmark $name with DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification + done done done done - From 7f76f32df80ee54ab3a63b9b1caaec5fd2c1671d Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 18:44:26 +0100 Subject: [PATCH 31/40] fixed bug --- scripts/margin.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/margin.sh b/scripts/margin.sh index 347e862..d1dcdcb 100755 --- 
a/scripts/margin.sh
+++ b/scripts/margin.sh
@@ -21,7 +21,7 @@ do
           for name in "${CNNS[@]}"
           do
             echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE"
-            numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification | true
+            numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification
           done
         done
       done

From dfd38aa0a5ef87ed3c883c4dd5a418126317e8d0 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Fri, 23 Feb 2024 12:57:29 +0000
Subject: [PATCH 32/40] fixed ipex issue

---
 dl_bench/llm.py | 7 ++++---
 dl_bench/utils.py | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/dl_bench/llm.py b/dl_bench/llm.py
index fb3873b..52082c0 100644
--- a/dl_bench/llm.py
+++ b/dl_bench/llm.py
@@ -3,7 +3,7 @@
 import math
 
 import torch
-import intel_extension_for_pytorch as ipex
+# import intel_extension_for_pytorch as ipex
 import numpy as np
 from transformers import (
     AutoModelForCausalLM,
@@ -76,14 +76,15 @@ def inference(self, backend):
         # self.flops_per_sample = get_macs(self.model, self.in_shape, backend) * 2
 
         self.model = backend.prepare_eval_transformer(self.model)
-        self.model.eval()
 
         enabled = backend.dtype != torch.float32
 
         n_items = 0
         outputs = []
         fw_times = []
-        self.model.eval()
+
+        # Ipex gives error with eval, other backends have no effect
+        # self.model.eval()
         for i in range(self.n_iter):
             print(f"Epoch {i+1}/{self.n_iter}")
             cast = torch.autocast(enabled=enabled, device_type=backend.device_name)
diff --git a/dl_bench/utils.py b/dl_bench/utils.py
index 2711843..e269fe7 100644
--- a/dl_bench/utils.py
+++ b/dl_bench/utils.py
@@ -132,7 +132,7 @@ def prepare_eval_transformer(self, model):
         model = model.to(memory_format=torch.channels_last)
         model.to(self.device)
 
-        with torch.inference_mode():
+        with torch.no_grad():
             model.eval()
         return self._compile_transformer_model(
             self.compile_mode, model, dtype=self.dtype

From f8b8f28ae3839bedad3491cd1fde8624726def91 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Fri, 23 Feb 2024 12:57:54 +0000
Subject: [PATCH 33/40] improved ipex

---
 dl_bench/llm.py | 1 -
 scripts/margin.sh | 19 +++++++++++--------
 scripts/margin_setup.sh | 1 +
 3 files changed, 12 insertions(+), 9 deletions(-)
 mode change 100644 => 100755 scripts/margin_setup.sh

diff --git a/dl_bench/llm.py b/dl_bench/llm.py
index 52082c0..8bcc38b 100644
--- a/dl_bench/llm.py
+++ b/dl_bench/llm.py
@@ -3,7 +3,6 @@
 import math
 
 import torch
-# import intel_extension_for_pytorch as ipex
 import numpy as np
 from transformers import (
     AutoModelForCausalLM,
diff --git a/scripts/margin.sh b/scripts/margin.sh
index d1dcdcb..87faed7 100755
--- a/scripts/margin.sh
+++ b/scripts/margin.sh
@@ -12,11 +12,11 @@ export ONEDNN_VERBOSE=0
 for i in 1 2 3 4 5 6 7
 do
     CNNS=(resnet50)
-    for COMPILER in dynamo ipex_onednn_graph
+    for COMPILER in ipex_onednn_graph
     do
-      for DTYPE in float32 bfloat16
+      for DTYPE in bfloat16
       do
-        for BS in 0001 0032 0128
+        for BS in 0001 0016 0032 0064 0128
         do
           for name in "${CNNS[@]}"
           do
@@ -28,15 +28,18 @@ done
 
 
-    LLMS=(gptj)
+    LLMS=(gptj llama2-7b)
     for COMPILER in dynamo ipex
     do
-      for DTYPE in float32 bfloat16
+      for BS in 0001 0004 0008
       do
-        for name in "${LLMS[@]}"
+        for DTYPE in bfloat16
         do
-          echo "Benchmark $name with DTYPE=$DTYPE"
-          numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" 
From 4849023452cebc6198eec8b5687ad30df9c7210b Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Fri, 23 Feb 2024 13:02:48 +0000
Subject: [PATCH 34/40] removed accidental file

---
 dl_bench/results.db | Bin 8192 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 dl_bench/results.db

diff --git a/dl_bench/results.db b/dl_bench/results.db
deleted file mode 100644
index 064d103a8d029f1d275998ec039bdd255aaf11d3..0000000000000000000000000000000000000000
GIT binary patch
(binary payload omitted)

From b0e39c4c24433a5d51441e21bf95493eee298a6c Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Fri, 23 Feb 2024 13:22:15 +0000
Subject: [PATCH 35/40] update

---
 scripts/margin.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/margin.sh b/scripts/margin.sh
index 87faed7..8c274f1 100755
--- a/scripts/margin.sh
+++ b/scripts/margin.sh
@@ -29,7 +29,7 @@ do
 
     LLMS=(gptj llama2-7b)
-    for COMPILER in dynamo ipex
+    for COMPILER in ipex
     do
         for BS in 0001 0004 0008
         do

From 41d4661ae9fff6386198a5caa258fd3c16d99a49 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Fri, 23 Feb 2024 14:32:09 +0000
Subject: [PATCH 36/40] fixed bugs

---
 dl_bench/llm.py   | 2 +-
 scripts/margin.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dl_bench/llm.py b/dl_bench/llm.py
index 8bcc38b..5972804 100644
--- a/dl_bench/llm.py
+++ b/dl_bench/llm.py
@@ -26,7 +26,7 @@ def get_llm(name, dtype):
 
     kwargs = {}
     if name.startswith("llama2") and "HF_TOKEN" in os.environ:
-        kwargs = {"HF_TOKEN": os.environ.get("HF_TOKEN")}
+        kwargs = {"token": os.environ.get("HF_TOKEN")}
 
     model_name, M, T = name2params[name]
 
diff --git a/scripts/margin.sh b/scripts/margin.sh
index 8c274f1..15d6eab 100755
--- a/scripts/margin.sh
+++ b/scripts/margin.sh
@@ -31,7 +31,7 @@ do
     LLMS=(gptj llama2-7b)
     for COMPILER in ipex
     do
-        for BS in 0001 0004 0008
+        for BS in 1 4 8
         do
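
Note on [PATCH 36]: the zero padding is dropped from the LLM batch sizes because the -p value appears to be evaluated as Python source (an assumption based on the "name='...',batch_size=..." syntax), and integer literals with leading zeros are a syntax error in Python 3. A quick sketch, keeping padding for display labels only:

    python3 -c "batch_size=0008"   # SyntaxError: leading zeros in decimal integer literals are not permitted
    python3 -c "batch_size=8"      # fine
    BS=8; printf -v BS_TXT "%04d" "$BS"; echo "desc label: bs${BS_TXT}"
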
From 98ea94c04767c58ef0d816a3c9f2ffe9a1ff7400 Mon Sep 17 00:00:00 2001
From: Egor
Date: Mon, 26 Feb 2024 14:04:08 +0100
Subject: [PATCH 37/40] Changed ipex from ipex-xpu -> ipex-cpu (#90)

---
 tests/conda-envs/ipex-xpu.yaml | 14 ++++++++++++++
 tests/conda-envs/ipex.yaml     | 14 +++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)
 create mode 100644 tests/conda-envs/ipex-xpu.yaml

diff --git a/tests/conda-envs/ipex-xpu.yaml b/tests/conda-envs/ipex-xpu.yaml
new file mode 100644
index 0000000..d136704
--- /dev/null
+++ b/tests/conda-envs/ipex-xpu.yaml
@@ -0,0 +1,14 @@
+name: ipex
+channels:
+  - intel
+  - conda-forge
+dependencies:
+  - intel-aikit-pytorch
+  - pytorch>=2.0.1=*_xpu_*
+  - intel-extension-for-pytorch
+  - datasets
+  - accelerate
+  - sentencepiece
+# The following packages are required to run benchmarks
+  - sqlalchemy>=2.0.0
+  - pytest
diff --git a/tests/conda-envs/ipex.yaml b/tests/conda-envs/ipex.yaml
index d136704..9691bde 100644
--- a/tests/conda-envs/ipex.yaml
+++ b/tests/conda-envs/ipex.yaml
@@ -3,12 +3,20 @@ channels:
   - intel
   - conda-forge
 dependencies:
-  - intel-aikit-pytorch
-  - pytorch>=2.0.1=*_xpu_*
-  - intel-extension-for-pytorch
+  - python=3.11
   - datasets
   - accelerate
   - sentencepiece
 # The following packages are required to run benchmarks
   - sqlalchemy>=2.0.0
   - pytest
+  - pip
+  - pip:
+    - --extra-index-url https://download.pytorch.org/whl/cpu
+    - torch
+    - torchvision
+    - torchaudio
+    - transformers==4.35.2
+    - intel-extension-for-pytorch
+    - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+    - oneccl_bind_pt

From a6a39a6f84dd3f80ddd8cc4c051986f2c5561844 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Tue, 27 Feb 2024 16:51:23 +0000
Subject: [PATCH 38/40] updated exp

---
 scripts/margin.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/margin.sh b/scripts/margin.sh
index 15d6eab..d5c9138 100755
--- a/scripts/margin.sh
+++ b/scripts/margin.sh
@@ -14,14 +14,14 @@ do
     CNNS=(resnet50)
     for COMPILER in ipex_onednn_graph
     do
-        for DTYPE in bfloat16
+        for DTYPE in float32 bfloat16
         do
-            for BS in 0001 0016 0032 0064 0128
+            for BS in 0001 0032
             do
                 for name in "${CNNS[@]}"
                 do
                     echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE"
-                    numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification
+                    numactl -m 0 --physcpubind=0-31 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification
                 done
             done
         done
@@ -31,14 +31,14 @@ do
     LLMS=(gptj llama2-7b)
     for COMPILER in ipex
     do
-        for BS in 1 4 8
+        for BS in 1 8
         do
             for DTYPE in bfloat16
             do
                 for name in "${LLMS[@]}"
                 do
                     echo "Benchmark $name with DTYPE=$DTYPE"
-                    numactl -N 1 benchmark-run -b llm -p "name='${name}',batch_size=${BS}" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification
+                    numactl -m 0 --physcpubind=0-31 benchmark-run -b llm -p "name='${name}',batch_size=${BS}" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification
                 done
             done
         done
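
Note on [PATCH 38]: "numactl -N 1" only binds execution to NUMA node 1, while "numactl -m 0 --physcpubind=0-31" fixes both the memory node and the exact core list, which makes repeated runs more comparable. A sketch for checking what a policy resolves to on a given machine (assumes numactl is installed and node 0 exists):

    numactl --hardware                               # nodes, their cores, and per-node memory
    numactl --show                                   # policy of the current shell
    numactl -m 0 --physcpubind=0-31 numactl --show   # reports membind 0 and cpus 0..31
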
$OMP_NUM_THREADS" +export ONEDNN_VERBOSE=0 + +for i in 1 2 3 4 5 6 7 +do + CNNS=(resnet50) + for COMPILER in ipex_onednn_graph + do + for DTYPE in float32 bfloat16 + do + for BS in 0001 + do + for name in "${CNNS[@]}" + do + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + export BENCH_COMMAND="benchmark-run -b cnn -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification" + numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND & + numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND & + numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND & + numactl -m 0 --physcpubind=16-19 $BENCH_COMMAND & + numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND & + numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND & + numactl -m 0 --physcpubind=28-31 $BENCH_COMMAND & + wait $(jobs -p) + done + done + done + done + + + LLMS=(gptj llama2-7b) + for COMPILER in ipex + do + for BS in 1 + do + for DTYPE in bfloat16 + do + for name in "${LLMS[@]}" + do + echo "Benchmark $name with DTYPE=$DTYPE" + export BENCH_COMMAND="benchmark-run -b llm -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification" + numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND & + numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND & + numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND & + numactl -m 0 --physcpubind=16-19 $BENCH_COMMAND & + numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND & + numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND & + numactl -m 0 --physcpubind=28-31 $BENCH_COMMAND & + wait $(jobs -p) + done + done + done + done +done From 3aba74a85de4603a80922cb9dd08b775c9ec57db Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Wed, 28 Feb 2024 15:13:28 +0000 Subject: [PATCH 40/40] fixed bug --- scripts/margin2.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/margin2.sh b/scripts/margin2.sh index 87efd63..76c6aee 100755 --- a/scripts/margin2.sh +++ b/scripts/margin2.sh @@ -22,6 +22,7 @@ do do echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" export BENCH_COMMAND="benchmark-run -b cnn -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification" + numactl -m 0 --physcpubind=0-3 $BENCH_COMMAND & numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND & numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND & numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND & @@ -29,6 +30,12 @@ do numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND & numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND & numactl -m 0 --physcpubind=28-31 $BENCH_COMMAND & + numactl -m 0 --physcpubind=32-35 $BENCH_COMMAND & + numactl -m 0 --physcpubind=36-39 $BENCH_COMMAND & + numactl -m 0 --physcpubind=40-43 $BENCH_COMMAND & + numactl -m 0 --physcpubind=44-47 $BENCH_COMMAND & + numactl -m 0 --physcpubind=48-51 $BENCH_COMMAND & + numactl -m 0 --physcpubind=52-55 $BENCH_COMMAND & wait $(jobs -p) done done @@ -47,6 +54,7 @@ do do echo "Benchmark $name with DTYPE=$DTYPE" export BENCH_COMMAND="benchmark-run -b llm -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification" + numactl -m 0 --physcpubind=0-3 $BENCH_COMMAND & numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND & numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND & numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND & @@ -54,6 +62,12 @@ do numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND & numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND & 
From 3aba74a85de4603a80922cb9dd08b775c9ec57db Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Wed, 28 Feb 2024 15:13:28 +0000
Subject: [PATCH 40/40] fixed bug

---
 scripts/margin2.sh | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/margin2.sh b/scripts/margin2.sh
index 87efd63..76c6aee 100755
--- a/scripts/margin2.sh
+++ b/scripts/margin2.sh
@@ -22,6 +22,7 @@ do
                 do
                     echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE"
                     export BENCH_COMMAND="benchmark-run -b cnn -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification"
+                    numactl -m 0 --physcpubind=0-3 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=16-19 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=28-31 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=32-35 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=36-39 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=40-43 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=44-47 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=48-51 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=52-55 $BENCH_COMMAND &
                     wait $(jobs -p)
                 done
             done
@@ -47,6 +54,7 @@ do
                 do
                     echo "Benchmark $name with DTYPE=$DTYPE"
                     export BENCH_COMMAND="benchmark-run -b llm -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification"
+                    numactl -m 0 --physcpubind=0-3 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=16-19 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=28-31 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=32-35 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=36-39 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=40-43 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=44-47 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=48-51 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=52-55 $BENCH_COMMAND &
                     wait $(jobs -p)
                 done
             done
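
One caveat on the synchronization used in [PATCH 40]: when wait is given several PIDs, it returns the exit status of the last one only, so a failing worker in the middle goes unnoticed. A sketch that checks every worker, assuming the same fourteen 4-core ranges as the final script (cores 0-55) and an exported BENCH_COMMAND:

    pids=()
    for START in $(seq 0 4 52); do
        numactl -m 0 --physcpubind="${START}-$((START + 3))" $BENCH_COMMAND &
        pids+=($!)
    done
    for pid in "${pids[@]}"; do
        wait "$pid" || echo "worker $pid failed"
    done
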