From 608a5ed7542b404dda373ab1b0c2c4937e9806af Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 9 Jan 2024 11:05:29 +0100 Subject: [PATCH 01/40] Added bfloat16 run --- cnn.sh | 15 +++++++++------ mlp.sh | 29 +++++++++++++++-------------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/cnn.sh b/cnn.sh index 60ad577..83d5974 100755 --- a/cnn.sh +++ b/cnn.sh @@ -10,11 +10,14 @@ if [[ -z "${DL_BENCH_ARGS}" ]]; then fi CNNS=(vgg16 resnet18 resnet50 resnext50 resnext101 densenet121 mobilenet_v3l) -for BS in 0001 0032 0128 +for DTYPE in float32 bfloat16 do - for name in "${CNNS[@]}" - do - echo "Benchmark $name" - benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --benchmark_desc "${name}_bs$BS" ${DL_BENCH_ARGS} || echo Failed - done + for BS in 0001 0032 0128 + do + for name in "${CNNS[@]}" + do + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" ${DL_BENCH_ARGS} || echo Failed + done + done done diff --git a/mlp.sh b/mlp.sh index e055d93..6db7dfa 100755 --- a/mlp.sh +++ b/mlp.sh @@ -9,21 +9,22 @@ if [[ -z "${DL_BENCH_ARGS}" ]]; then exit 1 fi -echo "Bfloat16 on size5" -benchmark-run -b mlp -p "name='size5',batch_size=1024" --benchmark_desc "size5_bs1024_bfloat16" --dtype bfloat16 ${DL_BENCH_ARGS} || echo Failed - # for size in size5_bn_gelu -for size in size2 size3 size4 size5 size5_sigm size5_tanh size5_gelu size5_linear size5_inplace size5_bn size5_bn_gelu size5_drop_gelu 100@512 25@1024 4@16384 2@16384 +for DTYPE in float32 bfloat16 do - echo "Benchmark $size" - benchmark-run -b mlp -p "name='${size}'" --benchmark_desc "${size}_bs1024" ${DL_BENCH_ARGS} || echo Failed -done + echo "DTYPE=${DTYPE}" + for size in size2 size3 size4 size5 size5_sigm size5_tanh size5_gelu size5_linear size5_inplace size5_bn size5_bn_gelu size5_drop_gelu 100@512 25@1024 4@16384 2@16384 + do + echo "Benchmark $size" + benchmark-run -b mlp -p "name='${size}'" --dtype "${DTYPE}" --benchmark_desc "${size}_bs1024" ${DL_BENCH_ARGS} || echo Failed + done -size="size5" -for BATCH_SIZE in 1 16 256 2048 8196 -do - echo "Batch size $BATCH_SIZE" - echo "Benchmark $size" - BATCH_SIZE_TXT=$(printf "%04d" $BATCH_SIZE) - benchmark-run -b mlp -p "name='${size}',batch_size=${BATCH_SIZE}" --benchmark_desc "${size}_bs${BATCH_SIZE_TXT}" ${DL_BENCH_ARGS} || echo Failed + size="size5" + for BATCH_SIZE in 1 16 256 2048 8196 + do + echo "Batch size $BATCH_SIZE" + echo "Benchmark $size" + BATCH_SIZE_TXT=$(printf "%04d" $BATCH_SIZE) + benchmark-run -b mlp -p "name='${size}',batch_size=${BATCH_SIZE}" --dtype "${DTYPE}" --benchmark_desc "${size}_bs${BATCH_SIZE_TXT}" ${DL_BENCH_ARGS} || echo Failed + done done From d5f9e187da0499787adb1c20b937846c53d1624b Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 9 Jan 2024 11:09:22 +0100 Subject: [PATCH 02/40] llm update --- llm.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llm.sh b/llm.sh index 94fe3ae..13681e1 100755 --- a/llm.sh +++ b/llm.sh @@ -9,5 +9,7 @@ if [[ -z "${DL_BENCH_ARGS}" ]]; then exit 1 fi -benchmark-run -b llm -p "" --benchmark_desc "gptj" --dtype float32 ${DL_BENCH_ARGS} || echo Failed -benchmark-run -b llm -p "" --benchmark_desc "gptj_bfloat16" --dtype bfloat16 ${DL_BENCH_ARGS} || echo Failed +for DTYPE in float32 bfloat16 +do + benchmark-run -b llm -p "" --benchmark_desc "gptj" --dtype "${DTYPE}" ${DL_BENCH_ARGS} || echo Failed +done From 8c397b107121e1b26a6a85a73291d121b1b9fb10 Mon Sep 17 00:00:00 2001 From: 
Egor Krivov Date: Tue, 9 Jan 2024 13:54:34 +0100 Subject: [PATCH 03/40] fixed --- dl_bench/utils.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index af2d41d..4561573 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -375,9 +375,14 @@ def inference(self, backend: Backend): self.compile(sample, backend) print("Warmup started") + enabled = not (backend.dtype == torch.float32) with torch.no_grad(): self.net.eval() - with tm.timeit("warmup_s"): + with tm.timeit("warmup_s"), torch.autocast( + enabled=enabled, + device_type=backend.device_name, + dtype=backend.dtype, + ): sample = backend.to_device(sample) self.net(sample) self.net(sample) @@ -396,13 +401,11 @@ def inference(self, backend: Backend): for i, x in enumerate(test_loader): s = get_time() x = backend.to_device(x) - if backend.dtype != torch.float32: - with torch.autocast( - device_type=backend.device_name, - dtype=backend.dtype, - ): - y = self.net(x) - else: + with torch.autocast( + enabled=enabled, + device_type=backend.device_name, + dtype=backend.dtype, + ): y = self.net(x) fw_times.append(get_time() - s) From 0ebd42f95e57eae2ce6314a15506c3fe7619f87c Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 9 Jan 2024 15:56:30 +0100 Subject: [PATCH 04/40] ipex activated --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8db6c53..84eb363 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -38,7 +38,7 @@ jobs: {device: 'cpu', compiler: 'torchscript'}, {device: 'cpu', compiler: 'torchscript_onednn'}, {device: 'cpu', compiler: 'ipex'}, - # {device: 'cpu', compiler: 'ipex_onednn_graph'}, + {device: 'cpu', compiler: 'ipex_onednn_graph'}, # {device: 'xpu', compiler: 'ipex'}, {device: 'cpu', compiler: 'torch_mlir'} ] From d26ad5de18b4ced17effb449a553f5f105fd4819 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 9 Jan 2024 16:47:53 +0100 Subject: [PATCH 05/40] reverted cm --- dl_bench/utils.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 4561573..0e38307 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -375,15 +375,18 @@ def inference(self, backend: Backend): self.compile(sample, backend) print("Warmup started") - enabled = not (backend.dtype == torch.float32) - with torch.no_grad(): + with torch.no_grad(), tm.timeit("warmup_s"): self.net.eval() - with tm.timeit("warmup_s"), torch.autocast( - enabled=enabled, - device_type=backend.device_name, - dtype=backend.dtype, - ): - sample = backend.to_device(sample) + sample = backend.to_device(sample) + if backend.dtype != torch.float32: + with torch.autocast( + device_type=backend.device_name, + dtype=backend.dtype, + ): + self.net(sample) + self.net(sample) + self.net(sample) + else: self.net(sample) self.net(sample) self.net(sample) @@ -401,11 +404,13 @@ def inference(self, backend: Backend): for i, x in enumerate(test_loader): s = get_time() x = backend.to_device(x) - with torch.autocast( - enabled=enabled, - device_type=backend.device_name, - dtype=backend.dtype, - ): + if backend.dtype != torch.float32: + with torch.autocast( + device_type=backend.device_name, + dtype=backend.dtype, + ): + y = self.net(x) + else: y = self.net(x) fw_times.append(get_time() - s) From 5c058a9dc44a1549d203d1cc55fae8eba26e7a94 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 6 Feb 2024 
17:02:52 +0100 Subject: [PATCH 06/40] first full exp --- scripts/margin.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 scripts/margin.sh diff --git a/scripts/margin.sh b/scripts/margin.sh new file mode 100644 index 0000000..9bb452e --- /dev/null +++ b/scripts/margin.sh @@ -0,0 +1,28 @@ +#!/bin/sh + +set -x + +HOST="test" + +export KMP_AFFINITY="respect,noreset,granularity=fine,balanced" +export OMP_NUM_THREADS=32 +export ONEDNN_VERBOSE=0 + +if [[ -z "${DL_BENCH_ARGS}" ]]; then + echo "Please, provide DL_BENCH_ARGS environment variable" + exit 1 +fi + +CNNS=(resnet50) +for COMPILER in dynamo ipex_onednn_graph +for DTYPE in float32 bfloat16 +do + for BS in 0001 0032 0128 + do + for name in "${CNNS[@]}" + do + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" + done + done +done From 89cf5dabd5179bcb4eb58025353c5c4077da2f8b Mon Sep 17 00:00:00 2001 From: Gregory Shimansky Date: Fri, 19 Jan 2024 10:41:13 -0600 Subject: [PATCH 07/40] Added xsmm backend for torch_mlir compiler (#70) Signed-off-by: Gregory Shimansky --- .github/workflows/test-single-config.yml | 1 + .github/workflows/test.yml | 12 ++---------- dl_bench/cli/launcher.py | 1 + dl_bench/utils.py | 7 +++++-- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test-single-config.yml b/.github/workflows/test-single-config.yml index da7c9d2..ea2d1de 100644 --- a/.github/workflows/test-single-config.yml +++ b/.github/workflows/test-single-config.yml @@ -21,6 +21,7 @@ on: - torch - dynamo - torch_mlir + - torch_mlir_xsmm - torchscript - torchscript_onednn - ipex diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 84eb363..7a5f307 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,7 +40,8 @@ jobs: {device: 'cpu', compiler: 'ipex'}, {device: 'cpu', compiler: 'ipex_onednn_graph'}, # {device: 'xpu', compiler: 'ipex'}, - {device: 'cpu', compiler: 'torch_mlir'} + {device: 'cpu', compiler: 'torch_mlir'}, + {device: 'cpu', compiler: 'torch_mlir_xsmm'} ] test_script: ${{ fromJson(inputs.test_scripts) }} fail-fast: false @@ -56,12 +57,3 @@ jobs: test_script: ${{ matrix.test_script }} secrets: DB_URL: ${{ secrets.DB_URL }} - - shutdown: - needs: mlp_test - if: ${{ contains(inputs.runner_type, 'amd') }} && inputs.shutdown_cloud_runner - runs-on: ${{ inputs.runner_type }} - steps: - - name: shutdown - shell: bash -el {0} - run: sudo shutdown -h +2 diff --git a/dl_bench/cli/launcher.py b/dl_bench/cli/launcher.py index eabcc3b..6115496 100644 --- a/dl_bench/cli/launcher.py +++ b/dl_bench/cli/launcher.py @@ -85,6 +85,7 @@ def parse_args(): "ipex", "ipex_onednn_graph", "torch_mlir", + "torch_mlir_xsmm", ], help="Compilation mode to use. 
No compilation by default.", ) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 0e38307..b0c9d8b 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -268,7 +268,7 @@ def _compile_model(compile_mode: str, device, model: Module, sample_input, dtype compiled_model = dynamo.optimize(be.refbackend_torchdynamo_backend)(model) print("Compiled with torch_mlir (torchscript, inference)") - elif compile_mode == "torch_mlir": + elif compile_mode == "torch_mlir" or compile_mode == "torch_mlir_xsmm": from torch_mlir._dynamo_fx_importer import import_fx_graph_as_func from torch_mlir_e2e_test.configs.torchdynamo import jit from torch_mlir_e2e_test.framework import TestOptions @@ -277,6 +277,9 @@ def _compile_model(compile_mode: str, device, model: Module, sample_input, dtype from torch_mlir_e2e_test.linalg_on_tensors_backends.cpuprotobackend import ( CpuProtoLinalgOnTensorsBackend, ) + from torch_mlir_e2e_test.linalg_on_tensors_backends.xsmmprotobackend import ( + XsmmProtoLinalgOnTensorsBackend, + ) import torch.utils._pytree as pytree # debug_timer seems to cause problems: @@ -290,7 +293,7 @@ def _compile_model(compile_mode: str, device, model: Module, sample_input, dtype opts, output_type="linalg-on-tensors", ) - backend = CpuProtoLinalgOnTensorsBackend(opts) + backend = CpuProtoLinalgOnTensorsBackend(opts) if compile_mode == "torch_mlir" else XsmmProtoLinalgOnTensorsBackend(opts) # backend = RefBackendLinalgOnTensorsBackend() module = backend.compile(module) backend_module = backend.load(module) From 4f7d5c969a84c1571c85bfe1ab564f6247b26794 Mon Sep 17 00:00:00 2001 From: Gregory Shimansky Date: Mon, 22 Jan 2024 10:36:13 -0600 Subject: [PATCH 08/40] Added weekly scheduled runs for all compilers (#72) Signed-off-by: Gregory Shimansky --- .github/workflows/test.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7a5f307..e542863 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,6 +21,9 @@ on: required: false default: '["./mlp.sh", "./cnn.sh", "./llm.sh"]' type: string + schedule: + # Runs at 12pm UTC (6am CST) on every Saturday + - cron: "0 12 * * 6" jobs: print_inputs: @@ -43,15 +46,15 @@ jobs: {device: 'cpu', compiler: 'torch_mlir'}, {device: 'cpu', compiler: 'torch_mlir_xsmm'} ] - test_script: ${{ fromJson(inputs.test_scripts) }} + test_script: ${{ github.event_name == 'workflow_dispatch' && fromJson(inputs.test_scripts) || fromJson('["./mlp.sh", "./cnn.sh", "./llm.sh"]') }} fail-fast: false uses: ./.github/workflows/execute-test-script.yml with: compiler: ${{ matrix.type.compiler }} device: ${{ matrix.type.device }} - tag: ${{ inputs.tag }} - torch_mlir_repo: ${{ inputs.torch_mlir_repo }} - torch_mlir_branch: ${{ inputs.torch_mlir_branch }} + tag: ${{ github.event_name == 'workflow_dispatch' && inputs.tag || 'ci' }} + torch_mlir_repo: ${{ github.event_name == 'workflow_dispatch' && inputs.torch_mlir_repo || 'intel-ai/torch-mlir' }} + torch_mlir_branch: ${{ github.event_name == 'workflow_dispatch' && inputs.torch_mlir_branch || 'cpu-proto' }} runner_type: spr shutdown_cloud_runner: false test_script: ${{ matrix.test_script }} From 1e7731965238e1ae5e6d90d4117305ae21d77722 Mon Sep 17 00:00:00 2001 From: Egor Date: Thu, 8 Feb 2024 11:44:54 +0100 Subject: [PATCH 09/40] Set python==3.11 (#76) --- tests/conda-envs/cpu.yaml | 1 + tests/conda-envs/cuda.yaml | 1 + tests/conda-envs/ipex.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/tests/conda-envs/cpu.yaml 
b/tests/conda-envs/cpu.yaml index b769970..3d70190 100644 --- a/tests/conda-envs/cpu.yaml +++ b/tests/conda-envs/cpu.yaml @@ -2,6 +2,7 @@ name: cpu channels: - pytorch dependencies: + - python==3.11 - pytorch - torchvision - torchaudio diff --git a/tests/conda-envs/cuda.yaml b/tests/conda-envs/cuda.yaml index 06c0afa..f83cfb4 100644 --- a/tests/conda-envs/cuda.yaml +++ b/tests/conda-envs/cuda.yaml @@ -3,6 +3,7 @@ channels: - pytorch - nvidia dependencies: + - python==3.11 - pytorch - torchvision - torchaudio diff --git a/tests/conda-envs/ipex.yaml b/tests/conda-envs/ipex.yaml index d136704..d6d5b2d 100644 --- a/tests/conda-envs/ipex.yaml +++ b/tests/conda-envs/ipex.yaml @@ -3,6 +3,7 @@ channels: - intel - conda-forge dependencies: + - python==3.11 - intel-aikit-pytorch - pytorch>=2.0.1=*_xpu_* - intel-extension-for-pytorch From 700a97cc38af4163ec27658f6eec6cb7af42c283 Mon Sep 17 00:00:00 2001 From: Ivy Zhang Date: Thu, 8 Feb 2024 21:11:35 +0800 Subject: [PATCH 10/40] skip 3 warmup steps in benchmarking (#75) --- dl_bench/mlp.py | 5 ++++- dl_bench/utils.py | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dl_bench/mlp.py b/dl_bench/mlp.py index 576198e..c05ddc2 100644 --- a/dl_bench/mlp.py +++ b/dl_bench/mlp.py @@ -87,7 +87,10 @@ def __init__(self, params) -> None: name = params.get("name", "size5") net = get_mlp(n_chans_in=IN_FEAT, n_chans_out=N_CLASSES, name=name) + min_batches = int(params.get("min_batches", 10)) + min_seconds = int(params.get("min_seconds", 10)) super().__init__( - net=net, in_shape=in_shape, dataset=dataset, batch_size=batch_size + net=net, in_shape=in_shape, dataset=dataset, batch_size=batch_size,\ + min_batches=min_batches, min_seconds=min_seconds ) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index b0c9d8b..af9bba7 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -121,6 +121,8 @@ def str_to_dtype(dtype: str): return torch.float32 elif dtype == "bfloat16": return torch.bfloat16 + elif dtype == "int8": + return torch.qint8 else: raise ValueError(f"Unsupported data type: {dtype}") @@ -415,7 +417,7 @@ def inference(self, backend: Backend): y = self.net(x) else: y = self.net(x) - + if i < 3: continue fw_times.append(get_time() - s) n_items += len(x) outputs.append(y) From 8398a93939c9f58cd6231ee9fcf60e4bd3c2befe Mon Sep 17 00:00:00 2001 From: Egor Date: Thu, 8 Feb 2024 14:23:09 +0100 Subject: [PATCH 11/40] Disabled background batch processing (#77) --- dl_bench/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index af9bba7..28776e5 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -34,10 +34,10 @@ def get_inf_loaders(n, in_shape, batch_size, device: str): ds = RandomInfDataset(n, in_shape) train_loader = DataLoader( - ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=pin_memory + ds, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=pin_memory ) test_loader = DataLoader( - ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=pin_memory + ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=pin_memory ) return train_loader, test_loader @@ -363,7 +363,7 @@ def inference(self, backend: Backend): self.dataset, batch_size=self.batch_size, shuffle=False, - num_workers=4, + num_workers=0, pin_memory=backend.device_name == "cuda", ) From 1b06c2d4cc9092373e5f04c8a612f84eb1ca74f5 Mon Sep 17 00:00:00 2001 From: Egor Date: Thu, 8 Feb 2024 17:28:39 +0100 Subject: [PATCH 12/40] Removed python311 from ipex (#79) 
--- tests/conda-envs/ipex.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/conda-envs/ipex.yaml b/tests/conda-envs/ipex.yaml index d6d5b2d..d136704 100644 --- a/tests/conda-envs/ipex.yaml +++ b/tests/conda-envs/ipex.yaml @@ -3,7 +3,6 @@ channels: - intel - conda-forge dependencies: - - python==3.11 - intel-aikit-pytorch - pytorch>=2.0.1=*_xpu_* - intel-extension-for-pytorch From 8029719f785e842d824ef4b1b02943cce504a87e Mon Sep 17 00:00:00 2001 From: Egor Date: Thu, 8 Feb 2024 17:29:26 +0100 Subject: [PATCH 13/40] Refactored warmup, increased dataset size for MLP (#78) --- dl_bench/mlp.py | 6 +++--- dl_bench/utils.py | 30 +++++++++--------------------- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/dl_bench/mlp.py b/dl_bench/mlp.py index c05ddc2..93845a8 100644 --- a/dl_bench/mlp.py +++ b/dl_bench/mlp.py @@ -81,8 +81,8 @@ def __init__(self, params) -> None: batch_size = int(params.get("batch_size", 1024)) - min_batches = 10 - DATASET_SIZE = max(10_240, batch_size * min_batches) + min_batches = 20 + DATASET_SIZE = max(102_400, batch_size * min_batches) dataset = RandomInfDataset(DATASET_SIZE, in_shape) name = params.get("name", "size5") @@ -92,5 +92,5 @@ def __init__(self, params) -> None: super().__init__( net=net, in_shape=in_shape, dataset=dataset, batch_size=batch_size,\ - min_batches=min_batches, min_seconds=min_seconds + min_batches=min_batches, min_seconds=min_seconds, warmup_batches=10 ) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 28776e5..9300fa2 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -343,12 +343,13 @@ def _get_device(device_name): class Benchmark: def __init__( - self, net, in_shape, dataset, batch_size, min_batches=10, min_seconds=10 + self, net, in_shape, dataset, batch_size, min_batches=10, min_seconds=10, warmup_batches=3, ) -> None: self.net = net self.in_shape = in_shape self.dataset = dataset self.batch_size = batch_size + self.warmup_batches = warmup_batches self.min_batches = min_batches self.min_seconds = min_seconds @@ -379,24 +380,6 @@ def inference(self, backend: Backend): sample = next(iter(test_loader)) self.compile(sample, backend) - print("Warmup started") - with torch.no_grad(), tm.timeit("warmup_s"): - self.net.eval() - sample = backend.to_device(sample) - if backend.dtype != torch.float32: - with torch.autocast( - device_type=backend.device_name, - dtype=backend.dtype, - ): - self.net(sample) - self.net(sample) - self.net(sample) - else: - self.net(sample) - self.net(sample) - self.net(sample) - print("Warmup done") - n_items = 0 self.net.eval() @@ -417,7 +400,11 @@ def inference(self, backend: Backend): y = self.net(x) else: y = self.net(x) - if i < 3: continue + + if i < self.warmup_batches: + start = time.perf_counter() + continue + fw_times.append(get_time() - s) n_items += len(x) outputs.append(y) @@ -425,7 +412,7 @@ def inference(self, backend: Backend): # early stopping if we have 10+ batches and were running for 10+ seconds if ( (time.perf_counter() - start) > self.min_seconds - and n_items > self.batch_size * self.min_batches + and n_items >= self.batch_size * self.min_batches ): break @@ -437,6 +424,7 @@ def inference(self, backend: Backend): ) results = tm.get_results() + results["duration_s"] = get_time() - start results["samples_per_s"] = n_items / sum(fw_times) results["flops_per_sample"] = self.flops_per_sample From 3cd8a8fc96262196acfa21158c496d46b4044c7f Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 12 Feb 2024 10:38:47 +0000 Subject: [PATCH 14/40] update --- dl_bench/llm.py | 
7 ++++--- dl_bench/utils.py | 4 +++- scripts/margin.sh | 27 ++++++++++++--------------- 3 files changed, 19 insertions(+), 19 deletions(-) mode change 100644 => 100755 scripts/margin.sh diff --git a/dl_bench/llm.py b/dl_bench/llm.py index 1f01256..d4001cf 100644 --- a/dl_bench/llm.py +++ b/dl_bench/llm.py @@ -1,6 +1,7 @@ import time import torch +import intel_extension_for_pytorch as ipex from transformers import AutoModelForCausalLM, AutoTokenizer from dl_bench.utils import TimerManager, Benchmark, str_to_dtype @@ -12,7 +13,7 @@ def get_llm(name, dtype): model_name = "EleutherAI/gpt-j-6B" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype, torchscript=True) tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") return tokenizer, model @@ -53,11 +54,11 @@ def inference(self, backend): print("Warmup started") with torch.inference_mode(), tm.timeit("warmup_s"): - self.model.eval() + # self.model.eval() self.generate(self.warmup_prompt) print("Warmup done") - self.model.eval() + # self.model.eval() enabled = backend.dtype != torch.float32 with torch.inference_mode(), torch.autocast( enabled=enabled, device_type=backend.device_name diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 9300fa2..57a77a0 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -184,7 +184,9 @@ def _compile_transformer_model(compile_mode, model, dtype=torch.bfloat16): import intel_extension_for_pytorch as ipex params = {} if dtype != torch.bfloat16 else {"dtype": torch.bfloat16} - compiled_model = ipex.optimize_transformers(model, **params) + #compiled_model = ipex.llm.optimize(model, **params, inplace=True, deployment_mode=True) + compiled_model = ipex.llm.optimize(model, **params) + # compiled_model = ipex.optimize_transformers(model, **params) print("Compiled with ipex") elif compile_mode == "ipex_onednn_graph": raise NotImplementedError() diff --git a/scripts/margin.sh b/scripts/margin.sh old mode 100644 new mode 100755 index 9bb452e..4fa8905 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash set -x @@ -8,21 +8,18 @@ export KMP_AFFINITY="respect,noreset,granularity=fine,balanced" export OMP_NUM_THREADS=32 export ONEDNN_VERBOSE=0 -if [[ -z "${DL_BENCH_ARGS}" ]]; then - echo "Please, provide DL_BENCH_ARGS environment variable" - exit 1 -fi - CNNS=(resnet50) for COMPILER in dynamo ipex_onednn_graph -for DTYPE in float32 bfloat16 do - for BS in 0001 0032 0128 - do - for name in "${CNNS[@]}" - do - echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" - done - done + for DTYPE in float32 bfloat16 + do + for BS in 0001 0032 0128 + do + for name in "${CNNS[@]}" + do + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" + done + done + done done From eee627bc036eee00e3d590967cec027a94a6d366 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 12 Feb 2024 16:56:23 +0000 Subject: [PATCH 15/40] update --- dl_bench/cli/launcher.py | 1 + dl_bench/utils.py | 2 ++ scripts/margin.sh | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dl_bench/cli/launcher.py b/dl_bench/cli/launcher.py index 6115496..92aa295 100644 --- a/dl_bench/cli/launcher.py 
+++ b/dl_bench/cli/launcher.py @@ -194,6 +194,7 @@ def main(): / (10**12) ) ) + print("FPS: {:.1f}".format(results.get("samples_per_s", 0))) pprint.pprint(report) if args.output is not None: diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 57a77a0..1f43e84 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -145,6 +145,7 @@ def __init__(self, device, compiler, dtype="float32") -> None: self.dtype = str_to_dtype(dtype) def to_device(self, x: torch.Tensor): + x = x.contiguous(memory_format=torch.channels_last) if self.device_name in ("cuda", "xpu"): return x.to(self.device) elif self.device_name == "cpu": @@ -230,6 +231,7 @@ def _compile_model(compile_mode: str, device, model: Module, sample_input, dtype # enable oneDNN graph fusion globally torch.jit.enable_onednn_fusion(True) compiled_model = torch.jit.trace(model, sample_input) + compiled_model = torch.jit.freeze(compiled_model) print("Compiled with torchscript onednn") elif compile_mode == "ipex": diff --git a/scripts/margin.sh b/scripts/margin.sh index 4fa8905..abbeced 100755 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -18,7 +18,7 @@ do for name in "${CNNS[@]}" do echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --verbose --skip_verification done done done From 95a20101a0d5d463cbec9fe3bcee125cdd76c443 Mon Sep 17 00:00:00 2001 From: Egor Date: Mon, 12 Feb 2024 18:00:47 +0100 Subject: [PATCH 16/40] Added sync for nvidia backend (#84) --- dl_bench/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 1f43e84..f28d186 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -153,6 +153,10 @@ def to_device(self, x: torch.Tensor): else: raise ValueError("Unknown device") + def sync(self): + if self.device_name == 'cuda': + torch.cuda.synchronize() + def prepare_eval_transformer(self, model): model = model.to(memory_format=torch.channels_last) @@ -394,6 +398,7 @@ def inference(self, backend: Backend): # Duration is inconsistent now with tm.timeit("duration_s"): for i, x in enumerate(test_loader): + backend.sync() s = get_time() x = backend.to_device(x) if backend.dtype != torch.float32: @@ -409,6 +414,7 @@ def inference(self, backend: Backend): start = time.perf_counter() continue + backend.sync() fw_times.append(get_time() - s) n_items += len(x) outputs.append(y) From 7c1ef81ab80c6fdeabbbf802eacb2fd831eec71f Mon Sep 17 00:00:00 2001 From: Egor Date: Tue, 13 Feb 2024 15:19:20 +0100 Subject: [PATCH 17/40] Added sql code for clean view (#86) --- db_tools/create_view.sql | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 db_tools/create_view.sql diff --git a/db_tools/create_view.sql b/db_tools/create_view.sql new file mode 100644 index 0000000..bea9518 --- /dev/null +++ b/db_tools/create_view.sql @@ -0,0 +1,16 @@ +CREATE OR REPLACE VIEW torchmlir_benchmark_view AS +SELECT + id, + REPLACE(REPLACE(CONCAT(host, '-', compiler, '-', dtype, '-', tag), 'torchscript', 'ts'), '-ci', '') AS backend, + host, + device, + compiler, + dtype, + tag, + benchmark, + benchmark_desc, + samples_per_s AS items_per_s, + flops_per_sample, + flops_per_sample * samples_per_s / 1e12 AS tflops, + date +FROM torchmlir_benchmark; From 
09f178e13bfa45a33d5775c2bbcd1473cd469b79 Mon Sep 17 00:00:00 2001 From: Egor Date: Tue, 13 Feb 2024 18:00:18 +0100 Subject: [PATCH 18/40] Added more measurement info like p50, p90 (#87) --- .github/workflows/execute-test-script.yml | 2 +- .github/workflows/test-single-config.yml | 1 - dl_bench/cli/launcher.py | 23 ++++---- dl_bench/utils.py | 69 +++++++++++++---------- 4 files changed, 49 insertions(+), 46 deletions(-) diff --git a/.github/workflows/execute-test-script.yml b/.github/workflows/execute-test-script.yml index 20ac472..5d1f23c 100644 --- a/.github/workflows/execute-test-script.yml +++ b/.github/workflows/execute-test-script.yml @@ -116,7 +116,7 @@ jobs: URL="--url ${{ secrets.DB_URL }}" fi - export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} -v ${URL}" + export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} ${URL}" # We mainly want to verify our own backend if [[ ${{ inputs.compiler }} != *torch_mlir* ]]; then diff --git a/.github/workflows/test-single-config.yml b/.github/workflows/test-single-config.yml index ea2d1de..3e8b323 100644 --- a/.github/workflows/test-single-config.yml +++ b/.github/workflows/test-single-config.yml @@ -78,7 +78,6 @@ jobs: torch_mlir_repo: ${{ inputs.torch_mlir_repo }} torch_mlir_branch: ${{ inputs.torch_mlir_branch }} runner_type: ${{ inputs.runner_type }} - shutdown_cloud_runner: ${{ inputs.shutdown_cloud_runner }} test_script: ${{ matrix.test_script }} secrets: DB_URL: ${{ secrets.DB_URL }} diff --git a/dl_bench/cli/launcher.py b/dl_bench/cli/launcher.py index 92aa295..fcf9025 100644 --- a/dl_bench/cli/launcher.py +++ b/dl_bench/cli/launcher.py @@ -110,9 +110,6 @@ def parse_args(): parser.add_argument( "-o", "--output", required=False, help="Path to output report file." ) - parser.add_argument( - "-v", "--verbose", required=False, action="store_true", help="Verbose mode." 
- ) parser.add_argument( "--skip_verification", required=False, @@ -185,17 +182,17 @@ def main(): db = BenchmarkDb(args.url) - if args.verbose: - print("Report:") - print( - "TFLOPS: {:.3}".format( - results.get("flops_per_sample", 0) - * results.get("samples_per_s", 0) - / (10**12) - ) + print("Report:") + print("FPS: {:.1f}".format(results.get("samples_per_s", 0))) + print( + "TFLOPS: {:.3}".format( + results.get("flops_per_sample", 0) + * results.get("samples_per_s", 0) + / (10**12) ) - print("FPS: {:.1f}".format(results.get("samples_per_s", 0))) - pprint.pprint(report) + ) + pprint.pprint(report) + pprint.pprint(results) if args.output is not None: with open(args.output, "w", encoding="UTF-8") as out: diff --git a/dl_bench/utils.py b/dl_bench/utils.py index f28d186..46b2386 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -389,54 +389,61 @@ def inference(self, backend: Backend): self.compile(sample, backend) n_items = 0 - - self.net.eval() outputs = [] fw_times = [] + + self.net.eval() with torch.no_grad(): start = time.perf_counter() - # Duration is inconsistent now - with tm.timeit("duration_s"): - for i, x in enumerate(test_loader): - backend.sync() - s = get_time() - x = backend.to_device(x) - if backend.dtype != torch.float32: - with torch.autocast( - device_type=backend.device_name, - dtype=backend.dtype, - ): - y = self.net(x) - else: + for i, x in enumerate(test_loader): + backend.sync() + s = get_time() + x = backend.to_device(x) + if backend.dtype != torch.float32: + with torch.autocast( + device_type=backend.device_name, + dtype=backend.dtype, + ): y = self.net(x) + else: + y = self.net(x) - if i < self.warmup_batches: - start = time.perf_counter() - continue + backend.sync() - backend.sync() - fw_times.append(get_time() - s) - n_items += len(x) - outputs.append(y) + if i < self.warmup_batches: + # We restart timer because that was just a warmup + start = time.perf_counter() + continue - # early stopping if we have 10+ batches and were running for 10+ seconds - if ( - (time.perf_counter() - start) > self.min_seconds - and n_items >= self.batch_size * self.min_batches - ): - break + fw_times.append(get_time() - s) + n_items += len(x) + outputs.append(y) + + # early stopping if we have 10+ batches and were running for 10+ seconds + if ( + (time.perf_counter() - start) > self.min_seconds + and n_items >= self.batch_size * self.min_batches + ): + break + + if (get_time() - start) > max_time: + break - if (get_time() - start) > max_time: - break + stop = get_time() print( f"Latency 0%-5%-50%-95%-100% are: {np.percentile(fw_times, [0, 5, 50, 95, 100])}" ) results = tm.get_results() - results["duration_s"] = get_time() - start + results["duration_s"] = stop - start results["samples_per_s"] = n_items / sum(fw_times) + results["dirty_items_per_s"] = n_items / results["duration_s"] results["flops_per_sample"] = self.flops_per_sample + results["n_items"] = n_items + results["p50"] = np.percentile(fw_times, 50) + results["p90"] = np.percentile(fw_times, 90) + results["p100"] = max(fw_times) return results, outputs From 0f01f6bc74a8a223525cb1f31261888d83caeef5 Mon Sep 17 00:00:00 2001 From: Egor Date: Tue, 13 Feb 2024 18:24:58 +0100 Subject: [PATCH 19/40] Fixed llm code for nvidia (#85) --- dl_bench/llm.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dl_bench/llm.py b/dl_bench/llm.py index d4001cf..5cf426f 100644 --- a/dl_bench/llm.py +++ b/dl_bench/llm.py @@ -32,12 +32,15 @@ def __init__(self, params) -> None: "num_beams": 4, } - def 
generate(self, prompt):
+    def generate(self, prompt, backend):
         input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+        backend.sync()
         start = time.perf_counter()
+        input_ids = backend.to_device(input_ids)
         gen_tokens = self.model.generate(
             input_ids, **self.gen_kwargs, pad_token_id=self.tokenizer.eos_token_id
         )
+        backend.sync()
         total_time = time.perf_counter() - start
 
         # text = self.tokenizer.batch_decode(gen_tokens)[0]
@@ -54,8 +57,8 @@
 
         print("Warmup started")
         with torch.inference_mode(), tm.timeit("warmup_s"):
-            # self.model.eval()
-            self.generate(self.warmup_prompt)
+            self.model.eval()
+            self.generate(self.warmup_prompt, backend)
         print("Warmup done")
 
         # self.model.eval()
         enabled = backend.dtype != torch.float32
         with torch.inference_mode(), torch.autocast(
             enabled=enabled, device_type=backend.device_name
         ), tm.timeit("duration_s"):
-            tokens, total_time = self.generate(self.prompt)
+            tokens, total_time = self.generate(self.prompt, backend)
             outputs = [tokens]
 
         results = tm.get_results()

From 0710c6efdf58c049c5e77154629a84eebd8c3c13 Mon Sep 17 00:00:00 2001
From: Egor
Date: Mon, 12 Feb 2024 18:00:47 +0100
Subject: [PATCH 20/40] Added sync for nvidia backend (#84)

---
 .vscode/settings.json                  | 3 +++
 cpu-dynamo-cnnsh-results.db/results.db | Bin 0 -> 8192 bytes
 cpu-dynamo-mlpsh-results.db/results.db | Bin 0 -> 8192 bytes
 dl_bench/results.db                    | Bin 0 -> 8192 bytes
 scripts/margin_setup.sh                | 5 +++++
 5 files changed, 8 insertions(+)
 create mode 100644 .vscode/settings.json
 create mode 100644 cpu-dynamo-cnnsh-results.db/results.db
 create mode 100644 cpu-dynamo-mlpsh-results.db/results.db
 create mode 100644 dl_bench/results.db
 create mode 100644 scripts/margin_setup.sh

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..457f44d
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.analysis.typeCheckingMode": "basic"
+}
\ No newline at end of file
diff --git a/cpu-dynamo-cnnsh-results.db/results.db b/cpu-dynamo-cnnsh-results.db/results.db
new file mode 100644
index 0000000000000000000000000000000000000000..a06c830f45de28b2aa8f5770fbc1961cc0a4f939
GIT binary patch
literal 8192
[8192 bytes of base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/cpu-dynamo-mlpsh-results.db/results.db b/cpu-dynamo-mlpsh-results.db/results.db
new file mode 100644
GIT binary patch
literal 8192
[8192 bytes of base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/dl_bench/results.db b/dl_bench/results.db
new file mode 100644
index 0000000000000000000000000000000000000000..064d103a8d029f1d275998ec039bdd255aaf11d3
GIT binary patch
literal 8192
[8192 bytes of base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/scripts/margin_setup.sh b/scripts/margin_setup.sh
new file mode 100644
index 0000000..b3411b9
--- /dev/null
+++ b/scripts/margin_setup.sh
@@ -0,0 +1,5 @@
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+sh -y ./Miniconda3-latest-Linux-x86_64.sh
+
+# get github repo
+conda install gh -c conda-forge --solver libmamba
\ No newline at end of file

From 2056ace335b6594a98e18b78ab0d86eb3b6da404 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Thu, 15 Feb 2024 13:25:28 +0100
Subject: [PATCH 21/40] cleaned

---
 .vscode/settings.json                  | 3 ---
 cpu-dynamo-cnnsh-results.db/results.db | Bin 8192 -> 0 bytes
 cpu-dynamo-mlpsh-results.db/results.db | Bin 8192 -> 0 bytes
 3 files changed, 3 deletions(-)
 delete mode 100644 .vscode/settings.json
 delete mode 100644 cpu-dynamo-cnnsh-results.db/results.db
 delete mode 100644 cpu-dynamo-mlpsh-results.db/results.db

diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index 457f44d..0000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "python.analysis.typeCheckingMode": "basic"
-}
\ No newline at end of file
diff --git a/cpu-dynamo-cnnsh-results.db/results.db b/cpu-dynamo-cnnsh-results.db/results.db
deleted file mode 100644
index a06c830f45de28b2aa8f5770fbc1961cc0a4f939..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8192
[8192 bytes of base85-encoded binary data omitted]

diff --git a/cpu-dynamo-mlpsh-results.db/results.db b/cpu-dynamo-mlpsh-results.db/results.db
deleted file mode 100644
GIT binary patch
literal 0
HcmV?d00001

literal 8192
[8192 bytes of base85-encoded binary data omitted]

From 6d5e68a391e07308e8a6ad1efbfe2fbc40dc9537 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Thu, 15 Feb 2024 13:44:00 +0100
Subject: [PATCH 22/40] Merged

---
 dl_bench/mlp.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dl_bench/mlp.py b/dl_bench/mlp.py
index 96680a2..9715197 100644
--- a/dl_bench/mlp.py
+++ b/dl_bench/mlp.py
@@ -89,8 +89,6 @@ def __init__(self, params) -> None:
 
         name = params.get("name", "size5")
         net = get_mlp(n_chans_in=IN_FEAT, n_chans_out=N_CLASSES, name=name)
-        min_batches = int(params.get("min_batches", 10))
-        min_seconds = int(params.get("min_seconds", 10))
 
         super().__init__(
             net=net,

From 2d287416dd4a99a2d84e8004c1d4abdac387b73e Mon Sep 17 00:00:00 2001
From: Egor
Date: Mon, 12 Feb 2024 18:00:47 +0100
Subject: [PATCH 23/40] Added sync for nvidia backend (#84)

---
 dl_bench/utils.py | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/dl_bench/utils.py 
b/dl_bench/utils.py index 46b2386..c9e6769 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -395,14 +395,34 @@ def inference(self, backend: Backend): self.net.eval() with torch.no_grad(): start = time.perf_counter() - for i, x in enumerate(test_loader): - backend.sync() - s = get_time() - x = backend.to_device(x) - if backend.dtype != torch.float32: - with torch.autocast( - device_type=backend.device_name, - dtype=backend.dtype, + # Duration is inconsistent now + with tm.timeit("duration_s"): + for i, x in enumerate(test_loader): + backend.sync() + s = get_time() + x = backend.to_device(x) + if backend.dtype != torch.float32: + with torch.autocast( + device_type=backend.device_name, + dtype=backend.dtype, + ): + y = self.net(x) + else: + y = self.net(x) + + if i < self.warmup_batches: + start = time.perf_counter() + continue + + backend.sync() + fw_times.append(get_time() - s) + n_items += len(x) + outputs.append(y) + + # early stopping if we have 10+ batches and were running for 10+ seconds + if ( + (time.perf_counter() - start) > self.min_seconds + and n_items >= self.batch_size * self.min_batches ): y = self.net(x) else: From a295084e8c70be334b6ee671cd0f13d96e73ab18 Mon Sep 17 00:00:00 2001 From: Egor Date: Tue, 13 Feb 2024 18:00:18 +0100 Subject: [PATCH 24/40] Added more measurement info like p50, p90 (#87) --- dl_bench/utils.py | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/dl_bench/utils.py b/dl_bench/utils.py index c9e6769..46b2386 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -395,34 +395,14 @@ def inference(self, backend: Backend): self.net.eval() with torch.no_grad(): start = time.perf_counter() - # Duration is inconsistent now - with tm.timeit("duration_s"): - for i, x in enumerate(test_loader): - backend.sync() - s = get_time() - x = backend.to_device(x) - if backend.dtype != torch.float32: - with torch.autocast( - device_type=backend.device_name, - dtype=backend.dtype, - ): - y = self.net(x) - else: - y = self.net(x) - - if i < self.warmup_batches: - start = time.perf_counter() - continue - - backend.sync() - fw_times.append(get_time() - s) - n_items += len(x) - outputs.append(y) - - # early stopping if we have 10+ batches and were running for 10+ seconds - if ( - (time.perf_counter() - start) > self.min_seconds - and n_items >= self.batch_size * self.min_batches + for i, x in enumerate(test_loader): + backend.sync() + s = get_time() + x = backend.to_device(x) + if backend.dtype != torch.float32: + with torch.autocast( + device_type=backend.device_name, + dtype=backend.dtype, ): y = self.net(x) else: From a349d0be43a9b5aed7cfa68e17794464cf394fd3 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 13:48:07 +0100 Subject: [PATCH 25/40] cleanup --- dl_bench/llm.py | 4 ++-- dl_bench/utils.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/dl_bench/llm.py b/dl_bench/llm.py index 5cf426f..71b53e5 100644 --- a/dl_bench/llm.py +++ b/dl_bench/llm.py @@ -13,7 +13,7 @@ def get_llm(name, dtype): model_name = "EleutherAI/gpt-j-6B" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype, torchscript=True) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype) tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") return tokenizer, model @@ -61,7 +61,7 @@ def inference(self, backend): self.generate(self.warmup_prompt, backend) print("Warmup done") - # self.model.eval() + self.model.eval() enabled = backend.dtype != 
torch.float32 with torch.inference_mode(), torch.autocast( enabled=enabled, device_type=backend.device_name diff --git a/dl_bench/utils.py b/dl_bench/utils.py index 46b2386..875ea99 100644 --- a/dl_bench/utils.py +++ b/dl_bench/utils.py @@ -145,7 +145,6 @@ def __init__(self, device, compiler, dtype="float32") -> None: self.dtype = str_to_dtype(dtype) def to_device(self, x: torch.Tensor): - x = x.contiguous(memory_format=torch.channels_last) if self.device_name in ("cuda", "xpu"): return x.to(self.device) elif self.device_name == "cpu": @@ -235,7 +234,6 @@ def _compile_model(compile_mode: str, device, model: Module, sample_input, dtype # enable oneDNN graph fusion globally torch.jit.enable_onednn_fusion(True) compiled_model = torch.jit.trace(model, sample_input) - compiled_model = torch.jit.freeze(compiled_model) print("Compiled with torchscript onednn") elif compile_mode == "ipex": From d3e3b3c7ed6838904df0c91232ab87adc87430aa Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 14:05:42 +0100 Subject: [PATCH 26/40] updated margin --- scripts/margin.sh | 3 ++- scripts/margin_setup.sh | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/margin.sh b/scripts/margin.sh index abbeced..fa1c0d9 100755 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -5,7 +5,8 @@ set -x HOST="test" export KMP_AFFINITY="respect,noreset,granularity=fine,balanced" -export OMP_NUM_THREADS=32 +export OMP_NUM_THREADS=$(grep ^cpu\\scores /proc/cpuinfo | uniq | awk '{print $4}') +echo "Cores configured $OMP_NUM_THREADS" export ONEDNN_VERBOSE=0 CNNS=(resnet50) diff --git a/scripts/margin_setup.sh b/scripts/margin_setup.sh index b3411b9..c21662b 100644 --- a/scripts/margin_setup.sh +++ b/scripts/margin_setup.sh @@ -1,5 +1,18 @@ +# install miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh sh -y ./Miniconda3-latest-Linux-x86_64.sh # get github repo -conda install gh -c conda-forge --solver libmamba \ No newline at end of file +conda install gh -c conda-forge --solver libmamba + +# set up env +conda create -y -n ipex python=3.11 +conda activate ipex +# Install ipex & pytorch +python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +python -m pip install intel-extension-for-pytorch +python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + +# Install benchmarks +pip install -e . 
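
A quick standalone sketch of the core-count detection that margin.sh now uses above. This is an illustration only, not part of the patch series; the nproc fallback is an assumption added here for robustness:

#!/bin/bash
# "cpu cores" in /proc/cpuinfo reports physical cores per socket; the 4th
# whitespace-separated field is the count, e.g. "cpu cores : 32".
cores=$(grep '^cpu cores' /proc/cpuinfo | uniq | awk '{print $4}')
# Fall back to the logical CPU count if the field is absent (e.g. some VMs).
cores="${cores:-$(nproc)}"
export OMP_NUM_THREADS="$cores"
echo "Cores configured $OMP_NUM_THREADS"
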
+ From e6355d57b6ec7ce81c9d12bf67a321337e56e7e6 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 13:08:29 +0000 Subject: [PATCH 27/40] update --- scripts/margin.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scripts/margin.sh b/scripts/margin.sh index fa1c0d9..248495c 100755 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -19,8 +19,23 @@ do for name in "${CNNS[@]}" do echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --verbose --skip_verification + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification done done done done + + +LLMS=(gptj) +for COMPILER in dynamo ipex +do + for DTYPE in float32 bfloat16 + do + for name in "${LMMS[@]}" + do + echo "Benchmark $name with DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification + done + done +done + From 7c28f274046e19185fd14c665ff3878d32d7ce74 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 17:49:45 +0100 Subject: [PATCH 28/40] Update --- scripts/margin.sh | 10 +++++----- scripts/margin_setup.sh | 10 +++++++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/scripts/margin.sh b/scripts/margin.sh index 248495c..932c7a6 100755 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -18,8 +18,8 @@ do do for name in "${CNNS[@]}" do - echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification | true done done done @@ -31,10 +31,10 @@ for COMPILER in dynamo ipex do for DTYPE in float32 bfloat16 do - for name in "${LMMS[@]}" + for name in "${LLMS[@]}" do - echo "Benchmark $name with DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification + echo "Benchmark $name with DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification done done done diff --git a/scripts/margin_setup.sh b/scripts/margin_setup.sh index c21662b..9ca50b2 100644 --- a/scripts/margin_setup.sh +++ b/scripts/margin_setup.sh @@ -1,3 +1,8 @@ +#!/bin/bash + +# We expect to have this repo present and this script run as +# ./scripts/margin.sh + # install miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh sh -y ./Miniconda3-latest-Linux-x86_64.sh @@ -6,8 +11,8 @@ sh -y ./Miniconda3-latest-Linux-x86_64.sh conda install gh -c conda-forge --solver libmamba # set up env -conda create -y -n ipex python=3.11 -conda activate ipex +conda create -y -n margin python=3.11 +conda activate margin # Install ipex & pytorch python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu python -m pip install 
intel-extension-for-pytorch @@ -15,4 +20,3 @@ python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension # Install benchmarks pip install -e . - From d085e41c12c06f3d6803f8e5dc19ce8f9e2cad16 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 18:42:46 +0100 Subject: [PATCH 29/40] update setup --- scripts/margin_setup.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/margin_setup.sh b/scripts/margin_setup.sh index 9ca50b2..0fb7948 100644 --- a/scripts/margin_setup.sh +++ b/scripts/margin_setup.sh @@ -5,10 +5,10 @@ # install miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -sh -y ./Miniconda3-latest-Linux-x86_64.sh - -# get github repo -conda install gh -c conda-forge --solver libmamba +curl -o Miniconda3-latest-Linux-x86_64.sh -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + sh Miniconda3-latest-Linux-x86_64.sh -u -b -p ./miniconda && \ + rm -f Miniconda3-latest-Linux-x86_64.sh +source ./miniconda/bin/activate # set up env conda create -y -n margin python=3.11 From 6ad377ea76700e5bbc409a6dd7810246f43aa1a5 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 18:43:30 +0100 Subject: [PATCH 30/40] added iteration --- scripts/margin.sh | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/scripts/margin.sh b/scripts/margin.sh index 932c7a6..347e862 100755 --- a/scripts/margin.sh +++ b/scripts/margin.sh @@ -9,33 +9,35 @@ export OMP_NUM_THREADS=$(grep ^cpu\\scores /proc/cpuinfo | uniq | awk '{print $ echo "Cores configured $OMP_NUM_THREADS" export ONEDNN_VERBOSE=0 -CNNS=(resnet50) -for COMPILER in dynamo ipex_onednn_graph +for i in 1 2 3 4 5 6 7 do - for DTYPE in float32 bfloat16 + CNNS=(resnet50) + for COMPILER in dynamo ipex_onednn_graph do - for BS in 0001 0032 0128 - do - for name in "${CNNS[@]}" - do - echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification | true - done - done + for DTYPE in float32 bfloat16 + do + for BS in 0001 0032 0128 + do + for name in "${CNNS[@]}" + do + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification | true + done + done + done done -done -LLMS=(gptj) -for COMPILER in dynamo ipex -do - for DTYPE in float32 bfloat16 + LLMS=(gptj) + for COMPILER in dynamo ipex do - for name in "${LLMS[@]}" + for DTYPE in float32 bfloat16 do - echo "Benchmark $name with DTYPE=$DTYPE" - numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification + for name in "${LLMS[@]}" + do + echo "Benchmark $name with DTYPE=$DTYPE" + numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification + done done done done - From 7f76f32df80ee54ab3a63b9b1caaec5fd2c1671d Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 15 Feb 2024 18:44:26 +0100 Subject: [PATCH 31/40] fixed bug --- scripts/margin.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/margin.sh b/scripts/margin.sh index 347e862..d1dcdcb 100755 --- 
a/scripts/margin.sh
+++ b/scripts/margin.sh
@@ -21,7 +21,7 @@ do
           for name in "${CNNS[@]}"
           do
             echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE"
-            numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification | true
+            numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification
           done
         done
       done

From dfd38aa0a5ef87ed3c883c4dd5a418126317e8d0 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Fri, 23 Feb 2024 12:57:29 +0000
Subject: [PATCH 32/40] fixed ipex issue

---
 dl_bench/llm.py | 7 ++++---
 dl_bench/utils.py | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/dl_bench/llm.py b/dl_bench/llm.py
index fb3873b..52082c0 100644
--- a/dl_bench/llm.py
+++ b/dl_bench/llm.py
@@ -3,7 +3,7 @@
 import math
 
 import torch
-import intel_extension_for_pytorch as ipex
+# import intel_extension_for_pytorch as ipex
 import numpy as np
 from transformers import (
     AutoModelForCausalLM,
@@ -76,14 +76,15 @@ def inference(self, backend):
         # self.flops_per_sample = get_macs(self.model, self.in_shape, backend) * 2
 
         self.model = backend.prepare_eval_transformer(self.model)
-        self.model.eval()
 
         enabled = backend.dtype != torch.float32
 
         n_items = 0
         outputs = []
         fw_times = []
-        self.model.eval()
+
+        # Ipex gives error with eval, other backends have no effect
+        # self.model.eval()
         for i in range(self.n_iter):
             print(f"Epoch {i+1}/{self.n_iter}")
             cast = torch.autocast(enabled=enabled, device_type=backend.device_name)
diff --git a/dl_bench/utils.py b/dl_bench/utils.py
index 2711843..e269fe7 100644
--- a/dl_bench/utils.py
+++ b/dl_bench/utils.py
@@ -132,7 +132,7 @@ def prepare_eval_transformer(self, model):
         model = model.to(memory_format=torch.channels_last)
         model.to(self.device)
 
-        with torch.inference_mode():
+        with torch.no_grad():
             model.eval()
         return self._compile_transformer_model(
             self.compile_mode, model, dtype=self.dtype

From f8b8f28ae3839bedad3491cd1fde8624726def91 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Fri, 23 Feb 2024 12:57:54 +0000
Subject: [PATCH 33/40] improved ipex

---
 dl_bench/llm.py | 1 -
 scripts/margin.sh | 19 +++++++++++--------
 scripts/margin_setup.sh | 1 +
 3 files changed, 12 insertions(+), 9 deletions(-)
 mode change 100644 => 100755 scripts/margin_setup.sh

diff --git a/dl_bench/llm.py b/dl_bench/llm.py
index 52082c0..8bcc38b 100644
--- a/dl_bench/llm.py
+++ b/dl_bench/llm.py
@@ -3,7 +3,6 @@
 import math
 
 import torch
-# import intel_extension_for_pytorch as ipex
 import numpy as np
 from transformers import (
     AutoModelForCausalLM,
diff --git a/scripts/margin.sh b/scripts/margin.sh
index d1dcdcb..87faed7 100755
--- a/scripts/margin.sh
+++ b/scripts/margin.sh
@@ -12,11 +12,11 @@ export ONEDNN_VERBOSE=0
 for i in 1 2 3 4 5 6 7
 do
     CNNS=(resnet50)
-    for COMPILER in dynamo ipex_onednn_graph
+    for COMPILER in ipex_onednn_graph
     do
-      for DTYPE in float32 bfloat16
+      for DTYPE in bfloat16
       do
-        for BS in 0001 0032 0128
+        for BS in 0001 0016 0032 0064 0128
         do
           for name in "${CNNS[@]}"
           do
@@ -28,15 +28,18 @@ done
 
 
-    LLMS=(gptj)
+    LLMS=(gptj llama2-7b)
     for COMPILER in dynamo ipex
     do
-      for DTYPE in float32 bfloat16
+      for BS in 0001 0004 0008
       do
-        for name in "${LLMS[@]}"
+        for DTYPE in bfloat16
         do
-          echo "Benchmark $name with DTYPE=$DTYPE"
-          numactl -N 1 benchmark-run -b llm -p "name='${name}'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" 
From 4849023452cebc6198eec8b5687ad30df9c7210b Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Fri, 23 Feb 2024 13:02:48 +0000
Subject: [PATCH 34/40] removed accidental file

---
 dl_bench/results.db | Bin 8192 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 dl_bench/results.db

diff --git a/dl_bench/results.db b/dl_bench/results.db
deleted file mode 100644
index 064d103a8d029f1d275998ec039bdd255aaf11d3..0000000000000000000000000000000000000000
GIT binary patch
(binary payload omitted)

From b0e39c4c24433a5d51441e21bf95493eee298a6c Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Fri, 23 Feb 2024 13:22:15 +0000
Subject: [PATCH 35/40] update

---
 scripts/margin.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/margin.sh b/scripts/margin.sh
index 87faed7..8c274f1 100755
--- a/scripts/margin.sh
+++ b/scripts/margin.sh
@@ -29,7 +29,7 @@ do
 
     LLMS=(gptj llama2-7b)
-    for COMPILER in dynamo ipex
+    for COMPILER in ipex
     do
         for BS in 0001 0004 0008
         do

From 41d4661ae9fff6386198a5caa258fd3c16d99a49 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Fri, 23 Feb 2024 14:32:09 +0000
Subject: [PATCH 36/40] fixed bugs

---
 dl_bench/llm.py   | 2 +-
 scripts/margin.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dl_bench/llm.py b/dl_bench/llm.py
index 8bcc38b..5972804 100644
--- a/dl_bench/llm.py
+++ b/dl_bench/llm.py
@@ -26,7 +26,7 @@ def get_llm(name, dtype):
 
     kwargs = {}
     if name.startswith("llama2") and "HF_TOKEN" in os.environ:
-        kwargs = {"HF_TOKEN": os.environ.get("HF_TOKEN")}
+        kwargs = {"token": os.environ.get("HF_TOKEN")}
 
     model_name, M, T = name2params[name]
 
diff --git a/scripts/margin.sh b/scripts/margin.sh
index 8c274f1..15d6eab 100755
--- a/scripts/margin.sh
+++ b/scripts/margin.sh
@@ -31,7 +31,7 @@ do
     LLMS=(gptj llama2-7b)
     for COMPILER in ipex
     do
-        for BS in 0001 0004 0008
+        for BS in 1 4 8
         do
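
Note on [PATCH 36]: the zero padding is dropped from the LLM batch sizes because the -p value appears to be evaluated as Python source (an assumption based on the "name='...',batch_size=..." syntax), and integer literals with leading zeros are a syntax error in Python 3. A quick sketch, keeping padding for display labels only:

    python3 -c "batch_size=0008"   # SyntaxError: leading zeros in decimal integer literals are not permitted
    python3 -c "batch_size=8"      # fine
    BS=8; printf -v BS_TXT "%04d" "$BS"; echo "desc label: bs${BS_TXT}"
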
From 98ea94c04767c58ef0d816a3c9f2ffe9a1ff7400 Mon Sep 17 00:00:00 2001
From: Egor
Date: Mon, 26 Feb 2024 14:04:08 +0100
Subject: [PATCH 37/40] Changed ipex from ipex-xpu -> ipex-cpu (#90)

---
 tests/conda-envs/ipex-xpu.yaml | 14 ++++++++++++++
 tests/conda-envs/ipex.yaml     | 14 +++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)
 create mode 100644 tests/conda-envs/ipex-xpu.yaml

diff --git a/tests/conda-envs/ipex-xpu.yaml b/tests/conda-envs/ipex-xpu.yaml
new file mode 100644
index 0000000..d136704
--- /dev/null
+++ b/tests/conda-envs/ipex-xpu.yaml
@@ -0,0 +1,14 @@
+name: ipex
+channels:
+  - intel
+  - conda-forge
+dependencies:
+  - intel-aikit-pytorch
+  - pytorch>=2.0.1=*_xpu_*
+  - intel-extension-for-pytorch
+  - datasets
+  - accelerate
+  - sentencepiece
+# The following packages are required to run benchmarks
+  - sqlalchemy>=2.0.0
+  - pytest
diff --git a/tests/conda-envs/ipex.yaml b/tests/conda-envs/ipex.yaml
index d136704..9691bde 100644
--- a/tests/conda-envs/ipex.yaml
+++ b/tests/conda-envs/ipex.yaml
@@ -3,12 +3,20 @@ channels:
   - intel
   - conda-forge
 dependencies:
-  - intel-aikit-pytorch
-  - pytorch>=2.0.1=*_xpu_*
-  - intel-extension-for-pytorch
+  - python=3.11
   - datasets
   - accelerate
   - sentencepiece
 # The following packages are required to run benchmarks
   - sqlalchemy>=2.0.0
   - pytest
+  - pip
+  - pip:
+    - --extra-index-url https://download.pytorch.org/whl/cpu
+    - torch
+    - torchvision
+    - torchaudio
+    - transformers==4.35.2
+    - intel-extension-for-pytorch
+    - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+    - oneccl_bind_pt

From a6a39a6f84dd3f80ddd8cc4c051986f2c5561844 Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Tue, 27 Feb 2024 16:51:23 +0000
Subject: [PATCH 38/40] updated exp

---
 scripts/margin.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/margin.sh b/scripts/margin.sh
index 15d6eab..d5c9138 100755
--- a/scripts/margin.sh
+++ b/scripts/margin.sh
@@ -14,14 +14,14 @@ do
     CNNS=(resnet50)
     for COMPILER in ipex_onednn_graph
     do
-        for DTYPE in bfloat16
+        for DTYPE in float32 bfloat16
         do
-            for BS in 0001 0016 0032 0064 0128
+            for BS in 0001 0032
             do
                 for name in "${CNNS[@]}"
                 do
                     echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE"
-                    numactl -N 1 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification
+                    numactl -m 0 --physcpubind=0-31 benchmark-run -b cnn -p "name='${name}',batch_size='$BS'" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification
                 done
             done
         done
@@ -31,14 +31,14 @@ do
     LLMS=(gptj llama2-7b)
     for COMPILER in ipex
     do
-        for BS in 1 4 8
+        for BS in 1 8
         do
             for DTYPE in bfloat16
             do
                 for name in "${LLMS[@]}"
                 do
                     echo "Benchmark $name with DTYPE=$DTYPE"
-                    numactl -N 1 benchmark-run -b llm -p "name='${name}',batch_size=${BS}" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification
+                    numactl -m 0 --physcpubind=0-31 benchmark-run -b llm -p "name='${name}',batch_size=${BS}" --dtype "${DTYPE}" --benchmark_desc "${name}_bs$BS" --host "${HOST}" -c "${COMPILER}" --skip_verification
                 done
             done
         done
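
Note on [PATCH 38]: "numactl -N 1" only binds execution to NUMA node 1, while "numactl -m 0 --physcpubind=0-31" fixes both the memory node and the exact core list, which makes repeated runs more comparable. A sketch for checking what a policy resolves to on a given machine (assumes numactl is installed and node 0 exists):

    numactl --hardware                               # nodes, their cores, and per-node memory
    numactl --show                                   # policy of the current shell
    numactl -m 0 --physcpubind=0-31 numactl --show   # reports membind 0 and cpus 0..31
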
$OMP_NUM_THREADS" +export ONEDNN_VERBOSE=0 + +for i in 1 2 3 4 5 6 7 +do + CNNS=(resnet50) + for COMPILER in ipex_onednn_graph + do + for DTYPE in float32 bfloat16 + do + for BS in 0001 + do + for name in "${CNNS[@]}" + do + echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" + export BENCH_COMMAND="benchmark-run -b cnn -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification" + numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND & + numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND & + numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND & + numactl -m 0 --physcpubind=16-19 $BENCH_COMMAND & + numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND & + numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND & + numactl -m 0 --physcpubind=28-31 $BENCH_COMMAND & + wait $(jobs -p) + done + done + done + done + + + LLMS=(gptj llama2-7b) + for COMPILER in ipex + do + for BS in 1 + do + for DTYPE in bfloat16 + do + for name in "${LLMS[@]}" + do + echo "Benchmark $name with DTYPE=$DTYPE" + export BENCH_COMMAND="benchmark-run -b llm -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification" + numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND & + numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND & + numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND & + numactl -m 0 --physcpubind=16-19 $BENCH_COMMAND & + numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND & + numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND & + numactl -m 0 --physcpubind=28-31 $BENCH_COMMAND & + wait $(jobs -p) + done + done + done + done +done From 3aba74a85de4603a80922cb9dd08b775c9ec57db Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Wed, 28 Feb 2024 15:13:28 +0000 Subject: [PATCH 40/40] fixed bug --- scripts/margin2.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/margin2.sh b/scripts/margin2.sh index 87efd63..76c6aee 100755 --- a/scripts/margin2.sh +++ b/scripts/margin2.sh @@ -22,6 +22,7 @@ do do echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE" export BENCH_COMMAND="benchmark-run -b cnn -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification" + numactl -m 0 --physcpubind=0-3 $BENCH_COMMAND & numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND & numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND & numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND & @@ -29,6 +30,12 @@ do numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND & numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND & numactl -m 0 --physcpubind=28-31 $BENCH_COMMAND & + numactl -m 0 --physcpubind=32-35 $BENCH_COMMAND & + numactl -m 0 --physcpubind=36-39 $BENCH_COMMAND & + numactl -m 0 --physcpubind=40-43 $BENCH_COMMAND & + numactl -m 0 --physcpubind=44-47 $BENCH_COMMAND & + numactl -m 0 --physcpubind=48-51 $BENCH_COMMAND & + numactl -m 0 --physcpubind=52-55 $BENCH_COMMAND & wait $(jobs -p) done done @@ -47,6 +54,7 @@ do do echo "Benchmark $name with DTYPE=$DTYPE" export BENCH_COMMAND="benchmark-run -b llm -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification" + numactl -m 0 --physcpubind=0-3 $BENCH_COMMAND & numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND & numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND & numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND & @@ -54,6 +62,12 @@ do numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND & numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND & 
From 3aba74a85de4603a80922cb9dd08b775c9ec57db Mon Sep 17 00:00:00 2001
From: Egor Krivov
Date: Wed, 28 Feb 2024 15:13:28 +0000
Subject: [PATCH 40/40] fixed bug

---
 scripts/margin2.sh | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/margin2.sh b/scripts/margin2.sh
index 87efd63..76c6aee 100755
--- a/scripts/margin2.sh
+++ b/scripts/margin2.sh
@@ -22,6 +22,7 @@ do
                 do
                     echo "Benchmark $name with BS=$BS and DTYPE=$DTYPE"
                     export BENCH_COMMAND="benchmark-run -b cnn -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification"
+                    numactl -m 0 --physcpubind=0-3 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=16-19 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=28-31 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=32-35 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=36-39 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=40-43 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=44-47 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=48-51 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=52-55 $BENCH_COMMAND &
                     wait $(jobs -p)
                 done
             done
@@ -47,6 +54,7 @@ do
                 do
                     echo "Benchmark $name with DTYPE=$DTYPE"
                     export BENCH_COMMAND="benchmark-run -b llm -p name='${name}',batch_size='$BS' --dtype ${DTYPE} --benchmark_desc ${name}_bs${BS}each4 --host ${HOST} -c ${COMPILER} --skip_verification"
+                    numactl -m 0 --physcpubind=0-3 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=4-7 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=8-11 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=12-15 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=16-19 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=20-23 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=24-27 $BENCH_COMMAND &
                     numactl -m 0 --physcpubind=28-31 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=32-35 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=36-39 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=40-43 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=44-47 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=48-51 $BENCH_COMMAND &
+                    numactl -m 0 --physcpubind=52-55 $BENCH_COMMAND &
                     wait $(jobs -p)
                 done
             done
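
One caveat on the synchronization used in [PATCH 40]: when wait is given several PIDs, it returns the exit status of the last one only, so a failing worker in the middle goes unnoticed. A sketch that checks every worker, assuming the same fourteen 4-core ranges as the final script (cores 0-55) and an exported BENCH_COMMAND:

    pids=()
    for START in $(seq 0 4 52); do
        numactl -m 0 --physcpubind="${START}-$((START + 3))" $BENCH_COMMAND &
        pids+=($!)
    done
    for pid in "${pids[@]}"; do
        wait "$pid" || echo "worker $pid failed"
    done
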