From a7a6ce930e6ab2ca2f917356fb57e7ebc772d10e Mon Sep 17 00:00:00 2001
From: bolt
Date: Tue, 20 May 2025 06:57:09 +0000
Subject: [PATCH 1/8] add GGML_USE_NUMA_MIGRATE feature to optimize cross NUMA
 op computation

---
 common/arg.cpp                          |   6 +
 ggml/CMakeLists.txt                     |   5 +
 ggml/include/ggml-backend.h             |   3 +
 ggml/include/ggml-cpu.h                 |   3 +
 ggml/src/CMakeLists.txt                 |  24 ++
 ggml/src/ggml-backend.cpp               | 312 ++++++++++++++++++++++++
 ggml/src/ggml-cpu/amx/amx.cpp           |   4 +
 ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp  |   8 +-
 ggml/src/ggml-cpu/ggml-cpu-impl.h       |   2 +
 ggml/src/ggml-cpu/ggml-cpu.c            | 182 +++++++++++++-
 ggml/src/ggml-cpu/kleidiai/kleidiai.cpp |   4 +
 src/llama-model-loader.cpp              |   4 +
 tools/llama-bench/llama-bench.cpp       |   9 +
 13 files changed, 561 insertions(+), 5 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index e2676bb878e28..e4592bfb847bf 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2276,12 +2276,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "- distribute: spread execution evenly over all nodes\n"
             "- isolate: only spawn threads on CPUs on the node that execution started on\n"
             "- numactl: use the CPU map provided by numactl\n"
+#ifdef GGML_USE_NUMA_MIGRATE
+            "- migrate: for affinity threads with page migration across NUMA nodes\n"
+#endif
             "if run without this previously, it is recommended to drop the system page cache before using this\n"
             "see https://github.com/ggml-org/llama.cpp/issues/1437",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+#ifdef GGML_USE_NUMA_MIGRATE
+            else if (value == "migrate") { params.numa = GGML_NUMA_STRATEGY_MIGRATE; }
+#endif
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 4746d5cb76c08..dd2d91ed86799 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -151,6 +151,11 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
                                             "ggml: BLAS library vendor")
 option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"                     ${GGML_LLAMAFILE_DEFAULT})
 
+option(GGML_NUMA_MIGRATE                    "ggml: use NUMA_MIGRATE"                  OFF)
+set(GGML_NUMA_MIGRATE_NODES "2" CACHE STRING
+                                            "ggml: the number of NUMA nodes during page migration")
+option(GGML_NUMA_MIGRATE_DEBUG              "ggml: enable debugging of NUMA_MIGRATE"  OFF)
+
 option(GGML_CUDA                            "ggml: use CUDA"                          OFF)
 option(GGML_MUSA                            "ggml: use MUSA"                          OFF)
 option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS" OFF)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 778927f68217a..6b02d5f24a54a 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -348,6 +348,9 @@ extern "C" {
     // CPU buffer types are always available
     GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+#ifdef GGML_USE_NUMA_MIGRATE
+    GGML_API size_t ggml_backend_get_page_size(void);
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index de77a875ec533..54c24ec537cac 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -28,6 +28,9 @@ extern "C" {
         GGML_NUMA_STRATEGY_ISOLATE  = 2,
         GGML_NUMA_STRATEGY_NUMACTL  = 3,
         GGML_NUMA_STRATEGY_MIRROR   = 4,
+#ifdef GGML_USE_NUMA_MIGRATE
+
GGML_NUMA_STRATEGY_MIGRATE = 5, +#endif GGML_NUMA_STRATEGY_COUNT }; diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ddea5ad3891e5..6da09ff842d8b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -343,3 +343,27 @@ if (BUILD_SHARED_LIBS) target_compile_definitions(${target} PUBLIC GGML_SHARED) endforeach() endif() + +if (GGML_NUMA_MIGRATE) + find_path(NUMA_ROOT_DIR + NAMES include/numa.h + PATHS ENV NUMA_ROOT + DOC "NUMA root directory") + + find_library(NUMA_LIBRARY + NAMES numa + HINTS ${NUMA_ROOT_DIR} + DOC "NUMA library") + + if (NOT NUMA_LIBRARY) + message(FATAL_ERROR "Could NOT find NUMA library.") + endif() + + if (GGML_NUMA_MIGRATE_DEBUG) + target_compile_definitions(ggml-base PUBLIC GGML_USE_NUMA_MIGRATE GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES} GGML_USE_NUMA_MIGRATE_DEBUG) + else() + target_compile_definitions(ggml-base PUBLIC GGML_USE_NUMA_MIGRATE GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES}) + endif() + + target_link_libraries(ggml-base PRIVATE ${NUMA_LIBRARY}) +endif() diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b30b4cb386f9f..fd31530becf2c 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -22,12 +22,51 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #ifdef __APPLE__ #include #include #endif +#ifdef GGML_USE_NUMA_MIGRATE +class numa_migrate_mapping_cache { +public: + void * addr; + int size; + numa_migrate_mapping_cache(void *addr, int size): addr(addr), size(size) { } + + bool operator<(const numa_migrate_mapping_cache& other) const { + if (addr != other.addr) { + return addr < other.addr; + } else { + return size < other.size; + } + } + + bool operator==(const numa_migrate_mapping_cache& other) const { + return (addr == other.addr && size == other.size); + } + +}; + +static std::set ggml_mapping_cache; +static size_t ggml_backend_page_size = 0; +static std::mutex ggml_mapping_mutex; +#endif // backend buffer type @@ -1658,6 +1697,244 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) { return ggml_backend_buffer_init_tensor(tensor->buffer, tensor); } +#ifdef GGML_USE_NUMA_MIGRATE +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG +static int check_numa_pages_migration(void *addr, size_t total_size) { + if (total_size % ggml_backend_page_size != 0) { + return -1; + } + + size_t offset = 0; // Offset in bytes from the start of the allocated memory + int num_nodes = GGML_NUMA_MIGRATE_NODES; + + for (int i = 0; i < num_nodes; ++i) { + int target_node = i; + size_t size_to_migrate = total_size / num_nodes; + + if (size_to_migrate > total_size - offset) { + GGML_LOG_ERROR( + "Error: Size to migrate to node %d exceeds remaining memory, " + "size_to_migrate: %ld, total: %ld\n", + target_node, size_to_migrate, total_size); + return -1; + } + + size_t num_pages_to_migrate = size_to_migrate / ggml_backend_page_size; + if (size_to_migrate % ggml_backend_page_size != 0) { + GGML_LOG_WARN("Warning: Size to migrate to node %ld is not a " + "multiple of page size, total: %ld size_to_migrate: " + "%ld, ggml_backend_page_size: %ld.\n", + target_node, total_size, size_to_migrate, + ggml_backend_page_size); + return -1; + } + + if (num_pages_to_migrate == 0) { + GGML_LOG_WARN("Warning: No pages to migrate to node %d.\n", + target_node); + continue; + } + + void *migrate_start_addr = (char *)addr + (i)*size_to_migrate; + + int *status = (int 
*)malloc(num_pages_to_migrate * sizeof(int)); + if (!status) { + GGML_LOG_ERROR("malloc for status failed"); + return -1; + } + memset(status, 0, num_pages_to_migrate * sizeof(int)); + + int *nodes = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!nodes) { + GGML_LOG_ERROR("malloc for nodes failed"); + return -1; + } + memset(nodes, 0, num_pages_to_migrate * sizeof(int)); + + void **addr_to_migrate = + (void **)malloc(num_pages_to_migrate * sizeof(void *)); + for (size_t j = 0; j < num_pages_to_migrate; j++) { + status[j] = 0; + nodes[j] = i; + addr_to_migrate[j] = (void *)((char *)migrate_start_addr + + j * ggml_backend_page_size); + } + + // check if pages are migrated + int ret = move_pages(0, num_pages_to_migrate, addr_to_migrate, NULL, + status, MPOL_MF_MOVE); + if (ret < 0) { + GGML_LOG_ERROR("check pages failed"); + free(status); + free(nodes); + return -1; + } + + for (size_t j = 0; j < num_pages_to_migrate; ++j) { + if (status[j] != target_node) { + GGML_LOG_WARN("Warning: Page %zu migration status to node %d: " + "%d, ret: %d, addr: %p\n", + j, target_node, status[j], ret, + addr_to_migrate[j]); + if (status[j] == -ENODEV) { + GGML_LOG_ERROR( + " - Error: No such device (NUMA node problem)\n"); + } else if (status[j] == -EPERM) { + GGML_LOG_ERROR( + " - Error: Operation not permitted (permissions)\n"); + } else if (status[j] == -ENOENT) { + GGML_LOG_ERROR(" - Error: ENOENT\n"); + } else if (status[j] == -EFAULT) { + GGML_LOG_ERROR(" - Error: Bad address\n"); + } else if (status[j] == -EINVAL) { + GGML_LOG_ERROR(" - Error: Invalid argument\n"); + } else if (status[j] == -ENOMEM) { + GGML_LOG_ERROR(" - Error: Out of memory\n"); + } else if (status[j] == -EACCES) { + GGML_LOG_ERROR(" - Error: access\n"); + } else if (status[j] == -ESRCH) { + GGML_LOG_ERROR(" - Error: access\n"); + } else { + GGML_LOG_ERROR(" - Error: Unknown status code at j: %ld: " + "%d, total_size: %ld\n", + j, status[j], total_size); + } + + exit(0); + return -1; + } + } + + free(status); + free(nodes); + free(addr_to_migrate); + + offset += size_to_migrate; + } + + GGML_LOG_INFO( + "page migration check passed at %p, size: %ld, num nodes: %d\n", addr, + total_size, num_nodes); + return 0; +} +#endif + +// Function to migrate pages to multiple NUMA nodes. +static int migrate_pages_multiple_nodes(void *addr, size_t total_size) { + + if (total_size % ggml_backend_page_size != 0) { + GGML_LOG_WARN("Warning: Total size is not a multiple of page size. 
" + "Some memory may not be migrated.\n"); + return -1; + } + + size_t offset = 0; // Offset in bytes from the start of the allocated memory + int num_nodes = GGML_NUMA_MIGRATE_NODES; + + for (int i = 0; i < num_nodes; ++i) { + int target_node = i; + size_t size_to_migrate = total_size / num_nodes; + + if (size_to_migrate > total_size - offset) { + GGML_LOG_ERROR( + "Error: Size to migrate to node %d exceeds remaining memory, " + "size_to_migrate: %ld, total: %ld\n", + target_node, size_to_migrate, total_size); + return -1; + } + + size_t num_pages_to_migrate = size_to_migrate / ggml_backend_page_size; + if (size_to_migrate % ggml_backend_page_size != 0) { +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + GGML_LOG_WARN("Warning: Size to migrate to node %ld is not a " + "multiple of page size, total: %ld size_to_migrate: " + "%ld, ggml_backend_page_size: %ld.\n", + target_node, total_size, size_to_migrate, + ggml_backend_page_size); +#endif + return -1; + } + + if (num_pages_to_migrate == 0) { + GGML_LOG_WARN("Warning: No pages to migrate to node %d.\n", + target_node); + continue; + } + + void *migrate_start_addr = (char *)addr + (i)*size_to_migrate; + + int *status = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!status) { + GGML_LOG_ERROR("malloc for status failed"); + return -1; + } + memset(status, 0, num_pages_to_migrate * sizeof(int)); + + int *nodes = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!nodes) { + GGML_LOG_ERROR("malloc for nodes failed"); + return -1; + } + memset(nodes, 0, num_pages_to_migrate * sizeof(int)); + + void **addr_to_migrate = + (void **)malloc(num_pages_to_migrate * sizeof(void *)); + for (size_t j = 0; j < num_pages_to_migrate; j++) { + status[j] = 0; + nodes[j] = i; + addr_to_migrate[j] = (void *)((char *)migrate_start_addr + + j * ggml_backend_page_size); + } + + int ret = move_pages(0, num_pages_to_migrate, addr_to_migrate, nodes, + status, MPOL_MF_MOVE); + if (ret < 0) { + GGML_LOG_ERROR("move_pages failed"); + free(status); + free(nodes); + return -1; + } + + free(status); + free(nodes); + free(addr_to_migrate); + + offset += size_to_migrate; + } + + return 0; +} + +static void migrate_pages_with_cache(void *addr, size_t size, + bool force_memset) { + if (size >= GGML_NUMA_MIGRATE_NODES * ggml_backend_page_size) { + numa_migrate_mapping_cache current_addr(addr, size); + std::lock_guard lock(ggml_mapping_mutex); + auto it = ggml_mapping_cache.find(current_addr); + if (it == ggml_mapping_cache.end()) { + GGML_ASSERT(((uint64_t)(addr) & (ggml_backend_page_size - 1)) == 0); + int num_pages = + size / ggml_backend_page_size / GGML_NUMA_MIGRATE_NODES; + if (num_pages && ((size % ggml_backend_page_size) == 0)) { + if (force_memset) { + memset(addr, 0, size); // force to allocate memory + } + if (migrate_pages_multiple_nodes(addr, size) != 0) { + GGML_LOG_DEBUG("Migration to multiple nodes failed, addr: " + "%p, size: %ld\n", + addr, size); + } else { +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + check_numa_pages_migration(addr, size); +#endif + } + ggml_mapping_cache.insert(current_addr); + } + } + } +} +#endif + enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->data == NULL); @@ -1668,6 +1945,11 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct tensor->buffer = buffer; tensor->data = addr; + +#ifdef GGML_USE_NUMA_MIGRATE + size_t size = ggml_backend_buffer_get_alloc_size(buffer, tensor); + 
migrate_pages_with_cache(tensor->data, size, true); +#endif return ggml_backend_buffer_init_tensor(buffer, tensor); } @@ -1861,16 +2143,28 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { uintptr_t data = (uintptr_t)buffer->context; +#ifdef GGML_USE_NUMA_MIGRATE + // align the buffer + if (data % ggml_backend_page_size != 0) { + data = GGML_PAD(data, ggml_backend_page_size); + } +#else // align the buffer if (data % TENSOR_ALIGNMENT != 0) { data = GGML_PAD(data, TENSOR_ALIGNMENT); } +#endif return (void *)data; } static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { +#ifdef GGML_USE_NUMA_MIGRATE + numa_free(buffer->context, buffer->size); +#else ggml_aligned_free(buffer->context, buffer->size); +#endif + } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -1939,8 +2233,22 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty GGML_UNUSED(buft); } +#ifdef GGML_USE_NUMA_MIGRATE +size_t ggml_backend_get_page_size(void) { + if (ggml_backend_page_size == 0) { + ggml_backend_page_size = sysconf(_SC_PAGE_SIZE); + } + return ggml_backend_page_size; +} +#endif + static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +#ifdef GGML_USE_NUMA_MIGRATE + ggml_backend_get_page_size(); + void * data = numa_alloc_onnode(size, 0); +#else void * data = ggml_aligned_malloc(size); +#endif if (data == NULL) { GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); @@ -1951,7 +2259,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back } static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp index 0f067137df006..222d8095cae74 100644 --- a/ggml/src/ggml-cpu/amx/amx.cpp +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -133,7 +133,11 @@ static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_back } static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 8ff6d64a4d0d1..e5c1ccc2de9a0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -6111,7 +6111,7 @@ template data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); } - ggml_barrier(params->threadpool); + ggml_barrier_numa_aware(params->threadpool, ith, GGML_BARRIER_NODE_LAST); const void * src1_wdata = params->wdata; const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10); @@ -6219,7 +6219,7 @@ template threadpool); + ggml_barrier_numa_aware(params->threadpool, ith, GGML_BARRIER_NODE_LAST); // compute each matrix multiplication in sequence for (int cur_a = 0; cur_a < n_as; ++cur_a) { @@ -6359,7 +6359,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(g } static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + 
return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index e4af07635c157..486f11a2b582a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -506,6 +506,8 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) { // TODO: move to ggml-threading void ggml_barrier(struct ggml_threadpool * tp); +#define GGML_BARRIER_NODE_LAST -1 +void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n); #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 133b50606bcd1..e2e552a6a0681 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -34,6 +34,7 @@ #include #include #include +#include #if defined(__gnu_linux__) #include #endif @@ -440,6 +441,13 @@ struct ggml_threadpool { atomic_int n_graph; // incremented when there is work to be done (i.e each graph) atomic_int GGML_CACHE_ALIGN n_barrier; atomic_int GGML_CACHE_ALIGN n_barrier_passed; + +#ifdef GGML_USE_NUMA_MIGRATE + atomic_int GGML_CACHE_ALIGN *n_barrier_node[GGML_NUMA_MIGRATE_NODES]; + atomic_int GGML_CACHE_ALIGN *n_barrier_passed_node[GGML_NUMA_MIGRATE_NODES]; + atomic_int GGML_CACHE_ALIGN *n_barrier_passed_last; +#endif + atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. // these are atomic as an annotation for thread-sanitizer @@ -505,6 +513,10 @@ struct ggml_numa_nodes { #else uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype #endif + +#ifdef GGML_USE_NUMA_MIGRATE + bool even_distributed; +#endif }; // @@ -555,6 +567,74 @@ void ggml_barrier(struct ggml_threadpool * tp) { #endif } +#ifdef GGML_USE_NUMA_MIGRATE +static int get_node_from_cpu(int cpu, int cores_per_numa) { + return cpu / cores_per_numa; +} +#endif + +void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { +#ifndef GGML_USE_NUMA_MIGRATE + UNUSED(ith); + UNUSED(node_n); + ggml_barrier(tp); + return; +#else + if ((g_state.numa.numa_strategy != GGML_NUMA_STRATEGY_MIGRATE) || !g_state.numa.even_distributed) { + ggml_barrier(tp); + return; + } + + int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); + if (n_threads == 1) { + return; + } + + int cores_per_numa = g_state.numa.nodes[0].n_cpus; + int numa_nodes = n_threads / cores_per_numa; + int remaining_cores = n_threads % cores_per_numa; + if (numa_nodes <= 1 || remaining_cores) { + ggml_barrier(tp); + return; + } + + if (node_n == GGML_BARRIER_NODE_LAST) { + node_n = tp->cgraph->n_nodes; + } + + int node = get_node_from_cpu(ith, cores_per_numa); + + int n_passed = atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_relaxed); + + // enter barrier (full seq-cst fence) + int n_barrier = atomic_fetch_add_explicit(tp->n_barrier_node[node], 1, memory_order_seq_cst); + + if (n_barrier == (cores_per_numa - 1)) { + // last thread of current numa node + atomic_store_explicit(tp->n_barrier_node[node], 0, memory_order_seq_cst); + + int n_passed_node = atomic_fetch_add_explicit(&tp->n_barrier_passed_last[node_n], 1, memory_order_seq_cst); + + if (n_passed_node == (numa_nodes - 1)) { // last numa node cpu + atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); + atomic_store_explicit(&tp->n_barrier_passed_last[node_n], 0, memory_order_seq_cst); + } else { + while 
(atomic_load_explicit(&tp->n_barrier_passed_last[node_n], memory_order_relaxed)) { + ggml_thread_cpu_relax(); + } + atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); + } + + return; + } + + // wait for other threads + while (atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_seq_cst) == n_passed) { + ggml_thread_cpu_relax(); + } +#endif +} + #if defined(__gnu_linux__) static cpu_set_t ggml_get_numa_affinity(void) { cpu_set_t cpuset; @@ -631,6 +711,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { struct ggml_numa_node * node = &g_state.numa.nodes[n]; GGML_PRINT_DEBUG("CPUs on node %u:", n); node->n_cpus = 0; + +#ifdef GGML_USE_NUMA_MIGRATE + g_state.numa.even_distributed = true; +#endif for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); @@ -640,6 +724,11 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { } } GGML_PRINT_DEBUG("\n"); +#ifdef GGML_USE_NUMA_MIGRATE + if ((n != 0) && (g_state.numa.nodes[n].n_cpus != g_state.numa.nodes[0].n_cpus)) { + g_state.numa.even_distributed = false; + } +#endif } if (ggml_is_numa()) { @@ -2070,6 +2159,30 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) + +#ifdef GGML_USE_NUMA_MIGRATE +static void set_numa_migrate_affinity(int core_no) { + // Check if the core number is valid + if (core_no < 0 || core_no >= (int)g_state.numa.total_cpus) { + printf("%s, Warn: core_no not between 0 and %d, failback.\n", __func__, g_state.numa.total_cpus); + return; + } + + cpu_set_t cpuset; + CPU_ZERO(&cpuset); // Initialize the CPU set + + CPU_SET(core_no, &cpuset); // Set the specified core + + // Set the thread's CPU affinity + int result = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + if (result != 0) { + printf("failed to set core_no affinity: %d\n", core_no); + perror("set_affinity"); + exit (1); + } +} +#endif + static void set_numa_thread_affinity(int thread_n) { if (!ggml_is_numa()) { return; @@ -2095,6 +2208,11 @@ static void set_numa_thread_affinity(int thread_n) { fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv)); } return; +#ifdef GGML_USE_NUMA_MIGRATE + case GGML_NUMA_STRATEGY_MIGRATE: + set_numa_migrate_affinity(thread_n); + return; +#endif default: return; } @@ -2840,11 +2958,37 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.threadpool=*/ tp, }; +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + struct timespec t0, t1, t2, t3, t4; + long d12, d32, d43; + bool log_time = true; + int log_node_n = 0; + if (log_time) { + clock_gettime(CLOCK_MONOTONIC, &t0); + } +#endif + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; - +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + if ((node->op == GGML_OP_MUL_MAT)) { + log_node_n = node_n; + log_time = true; + } else { + log_time = false; + } + if (log_time) { + clock_gettime(CLOCK_MONOTONIC, &t1); + } +#endif ggml_compute_forward(¶ms, node); +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + if (log_time) { + clock_gettime(CLOCK_MONOTONIC, &t2); + } +#endif + if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { atomic_store_explicit(&tp->abort, node_n + 1, 
memory_order_relaxed); @@ -2852,11 +2996,28 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } if (node_n + 1 < cgraph->n_nodes) { - ggml_barrier(state->threadpool); +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + if (log_time) { + clock_gettime(CLOCK_MONOTONIC, &t3); + } +#endif + + ggml_barrier_numa_aware(state->threadpool, state->ith, node_n); + +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + if (log_time) { + clock_gettime(CLOCK_MONOTONIC, &t4); + d12 = (t2.tv_sec - t1.tv_sec) * 1e9 + (t2.tv_nsec - t1.tv_nsec); + d32 = (t3.tv_sec - t2.tv_sec) * 1e9 + (t3.tv_nsec - t2.tv_nsec); + d43 = (t4.tv_sec - t3.tv_sec) * 1e9 + (t4.tv_nsec - t3.tv_nsec); + printf("%s, op: %d, ith: %d, cpu: %d, d12: %ld, d32: %ld, d43: %ld, t1: %ld, t2: %ld, t3: %ld, t4: %ld\n", \ + __func__, node->op, state->ith, sched_getcpu(), d12, d32, d43, t1.tv_nsec, t2.tv_nsec, t3.tv_nsec, t4.tv_nsec); + } +#endif } } - ggml_barrier(state->threadpool); + ggml_barrier_numa_aware(state->threadpool, state->ith, GGML_BARRIER_NODE_LAST); return 0; } @@ -3021,6 +3182,21 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( threadpool->n_graph = 0; threadpool->n_barrier = 0; threadpool->n_barrier_passed = 0; + +#ifdef GGML_USE_NUMA_MIGRATE + for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) { + threadpool->n_barrier_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node); + *threadpool->n_barrier_node[node] = 0; + threadpool->n_barrier_passed_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node); + *threadpool->n_barrier_passed_node[node] = 0; + } + + threadpool->n_barrier_passed_last = (atomic_int *)malloc((threadpool->cgraph->n_nodes + 1) * sizeof(atomic_int)); + for (int i = 0; i < threadpool->cgraph->n_nodes + 1; i++) { + threadpool->n_barrier_passed_last[i] = 0; + } +#endif + threadpool->current_chunk = 0; threadpool->stop = false; threadpool->pause = tpp->paused; diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 15f0cd1540686..27f3d1130e4e6 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -413,7 +413,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer( } static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ddb1b03675b28..e32700f2fc9b4 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -686,6 +686,10 @@ llama_model_loader::llama_model_loader( use_mmap = false; } +#ifdef GGML_USE_NUMA_MIGRATE + use_mmap = false; +#endif + this->use_mmap = use_mmap; this->check_tensors = check_tensors; } diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 06196cf24fc89..81dbaeac4cf59 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -312,7 +312,12 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf("options:\n"); printf(" -h, --help\n"); +#ifdef GGML_USE_NUMA_MIGRATE + printf(" --numa \n"); + printf(" numa mode (default: disabled)\n"); +#else printf(" --numa numa mode (default: disabled)\n"); +#endif printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n", @@ -628,6 +633,10 
@@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; +#ifdef GGML_USE_NUMA_MIGRATE + } else if (value == "migrate") { + params.numa = GGML_NUMA_STRATEGY_MIGRATE; +#endif } else { invalid_param = true; break; From 23fe7d38d2c591ea2f7f554bfbb41e23f09cfbb5 Mon Sep 17 00:00:00 2001 From: bolt Date: Tue, 20 May 2025 10:04:52 +0000 Subject: [PATCH 2/8] fix the threads number larger than numa node numbers --- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e2e552a6a0681..807ae1b15fad3 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -593,7 +593,7 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { int cores_per_numa = g_state.numa.nodes[0].n_cpus; int numa_nodes = n_threads / cores_per_numa; int remaining_cores = n_threads % cores_per_numa; - if (numa_nodes <= 1 || remaining_cores) { + if ((numa_nodes != GGML_NUMA_MIGRATE_NODES) || remaining_cores) { ggml_barrier(tp); return; } From e5cb47d8073dd44e77dac2b54afd35ebf449220e Mon Sep 17 00:00:00 2001 From: bolt Date: Wed, 21 May 2025 02:56:50 +0000 Subject: [PATCH 3/8] fix the buffer allocate size for NUMA page migration --- ggml/src/ggml-alloc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 5fd379f6a9461..5d0d1331a38af 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -948,6 +948,22 @@ static bool alloc_tensor_range(struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t size, ggml_backend_buffer_t ** buffers, size_t * n_buffers) { +#ifdef GGML_USE_NUMA_MIGRATE + size_t num_of_tensors = 0; + for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { + if (t->data == NULL) { + if (t->view_src == NULL) { + num_of_tensors++; + } + } + } + size_t ps = ggml_backend_get_page_size(); + size_t original_size = size; + size += ps * num_of_tensors; + GGML_LOG_DEBUG("alloc buffer for NUMA page migration, num of tensors: %ld, size increased from %ld to %ld, increased %ld MiB\n", + num_of_tensors, original_size, size, (size - original_size) / 1024 / 1024); +#endif + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); if (buffer == NULL) { GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size); From ab80f55a5a29738a1370fdf67ab5251cf53b6745 Mon Sep 17 00:00:00 2001 From: bolt Date: Wed, 21 May 2025 04:33:30 +0000 Subject: [PATCH 4/8] fix the cgraph null issue when running with llama-bench --- ggml/src/ggml-cpu/ggml-cpu-impl.h | 6 +++++- ggml/src/ggml-cpu/ggml-cpu.c | 10 +++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 486f11a2b582a..6f7cb928554c1 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -506,7 +506,11 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) { // TODO: move to ggml-threading void ggml_barrier(struct ggml_threadpool * tp); -#define GGML_BARRIER_NODE_LAST -1 +enum ggml_barrier_node_index { + GGML_BARRIER_NODE_PING = 0, + GGML_BARRIER_NODE_PONG = 1, + GGML_BARRIER_NODE_LAST = 2, +}; void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n); #ifdef __cplusplus diff --git 
a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 807ae1b15fad3..49c9567480770 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -598,10 +598,6 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { return; } - if (node_n == GGML_BARRIER_NODE_LAST) { - node_n = tp->cgraph->n_nodes; - } - int node = get_node_from_cpu(ith, cores_per_numa); int n_passed = atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_relaxed); @@ -3002,7 +2998,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } #endif - ggml_barrier_numa_aware(state->threadpool, state->ith, node_n); + ggml_barrier_numa_aware(state->threadpool, state->ith, node_n % GGML_BARRIER_NODE_LAST); #ifdef GGML_USE_NUMA_MIGRATE_DEBUG if (log_time) { @@ -3191,8 +3187,8 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( *threadpool->n_barrier_passed_node[node] = 0; } - threadpool->n_barrier_passed_last = (atomic_int *)malloc((threadpool->cgraph->n_nodes + 1) * sizeof(atomic_int)); - for (int i = 0; i < threadpool->cgraph->n_nodes + 1; i++) { + threadpool->n_barrier_passed_last = (atomic_int *)malloc(GGML_BARRIER_NODE_LAST * sizeof(atomic_int)); + for (int i = 0; i < GGML_BARRIER_NODE_LAST; i++) { threadpool->n_barrier_passed_last[i] = 0; } #endif From 3e3a878a3180df0a8d7a97182557e53cf3da2d4b Mon Sep 17 00:00:00 2001 From: bolt Date: Wed, 21 May 2025 08:17:07 +0000 Subject: [PATCH 5/8] use ggml_barrier instead of ggml_barrier_numa_aware for src1 data barrier as the barrier only happens in few cores --- ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index e5c1ccc2de9a0..3fcfa9e89a6bf 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -6111,7 +6111,7 @@ template data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); } - ggml_barrier_numa_aware(params->threadpool, ith, GGML_BARRIER_NODE_LAST); + ggml_barrier(params->threadpool); const void * src1_wdata = params->wdata; const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10); @@ -6219,7 +6219,7 @@ template threadpool, ith, GGML_BARRIER_NODE_LAST); + ggml_barrier(params->threadpool); // compute each matrix multiplication in sequence for (int cur_a = 0; cur_a < n_as; ++cur_a) { From ed4d9873b41ae6576622ab1fb990f8f74fff3f0a Mon Sep 17 00:00:00 2001 From: bolt Date: Thu, 22 May 2025 09:27:57 +0000 Subject: [PATCH 6/8] remove debug code for ggml barrier --- ggml/src/ggml-cpu/ggml-cpu.c | 44 ------------------------------------ 1 file changed, 44 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 49c9567480770..b89a674b15892 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2954,37 +2954,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.threadpool=*/ tp, }; -#ifdef GGML_USE_NUMA_MIGRATE_DEBUG - struct timespec t0, t1, t2, t3, t4; - long d12, d32, d43; - bool log_time = true; - int log_node_n = 0; - if (log_time) { - clock_gettime(CLOCK_MONOTONIC, &t0); - } -#endif - for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; -#ifdef GGML_USE_NUMA_MIGRATE_DEBUG - if ((node->op == GGML_OP_MUL_MAT)) { - log_node_n = node_n; - log_time = true; - } else { - log_time = false; - } - if (log_time) { 
- clock_gettime(CLOCK_MONOTONIC, &t1); - } -#endif ggml_compute_forward(¶ms, node); -#ifdef GGML_USE_NUMA_MIGRATE_DEBUG - if (log_time) { - clock_gettime(CLOCK_MONOTONIC, &t2); - } -#endif - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); @@ -2992,24 +2965,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } if (node_n + 1 < cgraph->n_nodes) { -#ifdef GGML_USE_NUMA_MIGRATE_DEBUG - if (log_time) { - clock_gettime(CLOCK_MONOTONIC, &t3); - } -#endif - ggml_barrier_numa_aware(state->threadpool, state->ith, node_n % GGML_BARRIER_NODE_LAST); - -#ifdef GGML_USE_NUMA_MIGRATE_DEBUG - if (log_time) { - clock_gettime(CLOCK_MONOTONIC, &t4); - d12 = (t2.tv_sec - t1.tv_sec) * 1e9 + (t2.tv_nsec - t1.tv_nsec); - d32 = (t3.tv_sec - t2.tv_sec) * 1e9 + (t3.tv_nsec - t2.tv_nsec); - d43 = (t4.tv_sec - t3.tv_sec) * 1e9 + (t4.tv_nsec - t3.tv_nsec); - printf("%s, op: %d, ith: %d, cpu: %d, d12: %ld, d32: %ld, d43: %ld, t1: %ld, t2: %ld, t3: %ld, t4: %ld\n", \ - __func__, node->op, state->ith, sched_getcpu(), d12, d32, d43, t1.tv_nsec, t2.tv_nsec, t3.tv_nsec, t4.tv_nsec); - } -#endif } } From e9c93714371c994ccb2db6a18883faf5f5c1ccc3 Mon Sep 17 00:00:00 2001 From: bolt Date: Tue, 3 Jun 2025 17:48:51 +0800 Subject: [PATCH 7/8] opt the src1 tensor data with local numa data, 33% uplift for tg128 --- ggml/include/ggml-cpu.h | 3 +++ ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 30 +++++++++++++++++++++----- ggml/src/ggml-cpu/ggml-cpu-impl.h | 6 ++++++ ggml/src/ggml-cpu/ggml-cpu.c | 18 ++++++++++++++++ ggml/src/ggml-cpu/ggml-cpu.cpp | 29 +++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 5 deletions(-) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 54c24ec537cac..79f93910e0d88 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -12,6 +12,9 @@ extern "C" { struct ggml_cplan { size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` +#ifdef GGML_USE_NUMA_MIGRATE + uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES]; +#endif int n_threads; struct ggml_threadpool * threadpool; diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 3fcfa9e89a6bf..586116ba2c9b9 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -15,6 +15,7 @@ #include #include // for qsort #include // for GGML_ASSERT +#include #include "ggml-cpu-aarch64.h" @@ -6094,7 +6095,12 @@ template src[0]) == 2); // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2); +#ifdef GGML_USE_NUMA_MIGRATE + int node_id = numa_node_of_cpu(ith); + char * wdata = static_cast(params->wdata_numa[node_id]); +#else char * wdata = static_cast(params->wdata); +#endif const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); assert(params->wsize >= nbw1 * ne11); @@ -6102,18 +6108,32 @@ template from_float; int64_t i11_processed = 0; - for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { +#ifdef GGML_USE_NUMA_MIGRATE + int round_cnts = ggml_cores_per_numa(); + int start_id = ith - round_cnts * node_id; + if (round_cnts == 0) { + round_cnts = nth; + start_id = ith; + } +#else + int round_cnts = nth; + int start_id = ith; +#endif + for (int64_t i11 = start_id * 4; i11 < ne11 - ne11 % 4; i11 += round_cnts * 4) { ggml_quantize_mat_t((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * 
nbw1), 4, ne10); } i11_processed = ne11 - ne11 % 4; - for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { + for (int64_t i11 = i11_processed + start_id; i11 < ne11; i11 += round_cnts) { from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); } +#ifdef GGML_USE_NUMA_MIGRATE + ggml_barrier_numa_aware(params->threadpool, ith, GGML_BARRIER_NODE_LAST); +#else ggml_barrier(params->threadpool); +#endif - const void * src1_wdata = params->wdata; const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10); int64_t src0_start = (ith * ne01) / nth; int64_t src0_end = ((ith + 1) * ne01) / nth; @@ -6128,13 +6148,13 @@ template (ne00, (float *) ((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); + (const char *) wdata, ne11 - ne11 % 4, src0_end - src0_start); } for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) { gemv(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata + (src1_col_stride * iter), 1, + (const char *) wdata + (src1_col_stride * iter), 1, src0_end - src0_start); } } diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 6f7cb928554c1..a3c633a6c979a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -22,6 +22,9 @@ struct ggml_compute_params { // work buffer for all threads size_t wsize; void * wdata; +#ifdef GGML_USE_NUMA_MIGRATE + void * wdata_numa[GGML_NUMA_MIGRATE_NODES]; +#endif struct ggml_threadpool * threadpool; }; @@ -512,6 +515,9 @@ enum ggml_barrier_node_index { GGML_BARRIER_NODE_LAST = 2, }; void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n); +#ifdef GGML_USE_NUMA_MIGRATE +int ggml_cores_per_numa(void); +#endif #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b89a674b15892..994d4a56d12af 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -571,6 +571,10 @@ void ggml_barrier(struct ggml_threadpool * tp) { static int get_node_from_cpu(int cpu, int cores_per_numa) { return cpu / cores_per_numa; } + +int ggml_cores_per_numa(void) { + return g_state.numa.nodes[0].n_cpus; +} #endif void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { @@ -2933,6 +2937,11 @@ struct ggml_cplan ggml_graph_plan( cplan.n_threads = MIN(max_tasks, n_threads); cplan.work_size = work_size; cplan.work_data = NULL; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + cplan.work_data_numa[i] = NULL; + } +#endif return cplan; } @@ -2951,9 +2960,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed), /*.wsize =*/ cplan->work_size, /*.wdata =*/ cplan->work_data, +#ifdef GGML_USE_NUMA_MIGRATE + /*.wdata_numa =*/ {NULL, NULL}, +#endif /*.threadpool=*/ tp, }; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + params.wdata_numa[i] = cplan->work_data_numa[numa_node_of_cpu(state->ith)]; + } +#endif + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; ggml_compute_forward(¶ms, node); diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 
e013e8b416222..a8b3b2d26ccc3 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef GGML_USE_CPU_HBM # include "ggml-cpu-hbm.h" @@ -87,6 +88,9 @@ struct ggml_backend_cpu_context { ggml_threadpool_t threadpool; uint8_t * work_data; +#ifdef GGML_USE_NUMA_MIGRATE + uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES]; +#endif size_t work_size; ggml_abort_callback abort_callback; @@ -102,6 +106,11 @@ static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { static void ggml_backend_cpu_free(ggml_backend_t backend) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; delete[] cpu_ctx->work_data; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + numa_free(cpu_ctx->work_data_numa[i], cpu_ctx->work_size); + } +#endif delete cpu_ctx; delete backend; } @@ -162,9 +171,24 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cpu_ctx->work_size = 0; return GGML_STATUS_ALLOC_FAILED; } + +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + cpu_ctx->work_data_numa[i] = (uint8_t *)numa_alloc_onnode(cplan.work_size, i); + if (cpu_ctx->work_data_numa[i] == NULL) { + cpu_ctx->work_size = 0; + return GGML_STATUS_ALLOC_FAILED; + } + } +#endif cpu_ctx->work_size = cplan.work_size; } cplan.work_data = (uint8_t *)cpu_ctx->work_data; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + cplan.work_data_numa[i] = (uint8_t *)(cpu_ctx->work_data_numa[i]); + } +#endif cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; @@ -205,6 +229,11 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->threadpool = NULL; ctx->work_data = NULL; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + ctx->work_data_numa[i] = NULL; + } +#endif ctx->work_size = 0; ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; From 88358235e186d2522e257193887518bcfab83245 Mon Sep 17 00:00:00 2001 From: bolt Date: Thu, 12 Jun 2025 13:40:55 +0800 Subject: [PATCH 8/8] fix the perf regression with numa_node_of_cpu() system call and the n_barrier_passed_last indexing issue --- ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 2 +- ggml/src/ggml-cpu/ggml-cpu-impl.h | 2 ++ ggml/src/ggml-cpu/ggml-cpu.c | 24 ++++++++++++------------ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 586116ba2c9b9..2ffee98171e98 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -6096,7 +6096,7 @@ template src[1]) == 2); #ifdef GGML_USE_NUMA_MIGRATE - int node_id = numa_node_of_cpu(ith); + int node_id = ggml_get_node_from_cpu(ith); char * wdata = static_cast(params->wdata_numa[node_id]); #else char * wdata = static_cast(params->wdata); diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index a3c633a6c979a..e74b73b29133d 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -513,10 +513,12 @@ enum ggml_barrier_node_index { GGML_BARRIER_NODE_PING = 0, GGML_BARRIER_NODE_PONG = 1, GGML_BARRIER_NODE_LAST = 2, + GGML_BARRIER_NODE_CNTS = 3 }; void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n); #ifdef GGML_USE_NUMA_MIGRATE int 
ggml_cores_per_numa(void); +int ggml_get_node_from_cpu(int cpu); #endif #ifdef __cplusplus diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 994d4a56d12af..a76477b8a8b3c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -445,7 +445,7 @@ struct ggml_threadpool { #ifdef GGML_USE_NUMA_MIGRATE atomic_int GGML_CACHE_ALIGN *n_barrier_node[GGML_NUMA_MIGRATE_NODES]; atomic_int GGML_CACHE_ALIGN *n_barrier_passed_node[GGML_NUMA_MIGRATE_NODES]; - atomic_int GGML_CACHE_ALIGN *n_barrier_passed_last; + atomic_int GGML_CACHE_ALIGN *n_barrier_passed_last[GGML_BARRIER_NODE_CNTS]; #endif atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. @@ -568,8 +568,8 @@ void ggml_barrier(struct ggml_threadpool * tp) { } #ifdef GGML_USE_NUMA_MIGRATE -static int get_node_from_cpu(int cpu, int cores_per_numa) { - return cpu / cores_per_numa; +int ggml_get_node_from_cpu(int cpu) { + return cpu / g_state.numa.nodes[0].n_cpus; } int ggml_cores_per_numa(void) { @@ -594,7 +594,7 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { return; } - int cores_per_numa = g_state.numa.nodes[0].n_cpus; + int cores_per_numa = ggml_cores_per_numa(); int numa_nodes = n_threads / cores_per_numa; int remaining_cores = n_threads % cores_per_numa; if ((numa_nodes != GGML_NUMA_MIGRATE_NODES) || remaining_cores) { @@ -602,7 +602,7 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { return; } - int node = get_node_from_cpu(ith, cores_per_numa); + int node = ggml_get_node_from_cpu(ith); int n_passed = atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_relaxed); @@ -613,13 +613,13 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { // last thread of current numa node atomic_store_explicit(tp->n_barrier_node[node], 0, memory_order_seq_cst); - int n_passed_node = atomic_fetch_add_explicit(&tp->n_barrier_passed_last[node_n], 1, memory_order_seq_cst); + int n_passed_node = atomic_fetch_add_explicit(tp->n_barrier_passed_last[node_n], 1, memory_order_seq_cst); if (n_passed_node == (numa_nodes - 1)) { // last numa node cpu atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); - atomic_store_explicit(&tp->n_barrier_passed_last[node_n], 0, memory_order_seq_cst); + atomic_store_explicit(tp->n_barrier_passed_last[node_n], 0, memory_order_seq_cst); } else { - while (atomic_load_explicit(&tp->n_barrier_passed_last[node_n], memory_order_relaxed)) { + while (atomic_load_explicit(tp->n_barrier_passed_last[node_n], memory_order_relaxed)) { ggml_thread_cpu_relax(); } atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); @@ -2968,7 +2968,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { #ifdef GGML_USE_NUMA_MIGRATE for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { - params.wdata_numa[i] = cplan->work_data_numa[numa_node_of_cpu(state->ith)]; + params.wdata_numa[i] = cplan->work_data_numa[ggml_get_node_from_cpu(state->ith)]; } #endif @@ -3161,9 +3161,9 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( *threadpool->n_barrier_passed_node[node] = 0; } - threadpool->n_barrier_passed_last = (atomic_int *)malloc(GGML_BARRIER_NODE_LAST * sizeof(atomic_int)); - for (int i = 0; i < GGML_BARRIER_NODE_LAST; i++) { - threadpool->n_barrier_passed_last[i] = 0; + for (int i = 0; i < GGML_BARRIER_NODE_CNTS; i++) { + threadpool->n_barrier_passed_last[i] = 
(atomic_int *)malloc(sizeof(atomic_int));
+        *threadpool->n_barrier_passed_last[i] = 0;
     }
 #endif
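
The following standalone sketch (not part of the patch series) illustrates the mechanism that migrate_pages_multiple_nodes() in patch 1 relies on: a page-aligned, already-faulted buffer is split into contiguous per-node chunks and each chunk's pages are handed to move_pages() with a per-page target node. It is a minimal sketch under stated assumptions — libnuma is installed (link with -lnuma), the machine has at least two NUMA nodes, and the buffer size divides evenly into whole pages per node; the names NODES and split_pages_across_nodes() are illustrative and do not appear in the patch.

/*
 * Minimal sketch: distribute a buffer across NUMA nodes with move_pages().
 * NODES stands in for GGML_NUMA_MIGRATE_NODES; error handling is reduced
 * to what the example needs.
 */
#include <numa.h>
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define NODES 2 /* stand-in for GGML_NUMA_MIGRATE_NODES */

static int split_pages_across_nodes(void * addr, size_t total_size) {
    const size_t page = (size_t) sysconf(_SC_PAGE_SIZE);
    if (total_size % (page * NODES) != 0) {
        return -1; /* only handle sizes that split into whole pages per node */
    }
    const size_t per_node = total_size / NODES;
    const size_t n_pages  = per_node / page;

    void ** pages  = malloc(n_pages * sizeof(void *));
    int *   nodes  = malloc(n_pages * sizeof(int));
    int *   status = malloc(n_pages * sizeof(int));
    if (!pages || !nodes || !status) {
        free(pages); free(nodes); free(status);
        return -1;
    }

    int rc = 0;
    for (int n = 0; n < NODES && rc == 0; ++n) {
        char * base = (char *) addr + (size_t) n * per_node;
        for (size_t i = 0; i < n_pages; ++i) {
            pages[i]  = base + i * page; /* one entry per page in this chunk */
            nodes[i]  = n;               /* target node for the whole chunk  */
            status[i] = 0;
        }
        /* pid 0 = calling process; MPOL_MF_MOVE moves pages it owns */
        if (move_pages(0, n_pages, pages, nodes, status, MPOL_MF_MOVE) < 0) {
            perror("move_pages");
            rc = -1;
        }
    }

    free(pages); free(nodes); free(status);
    return rc;
}

int main(void) {
    if (numa_available() < 0 || numa_max_node() < NODES - 1) {
        fprintf(stderr, "need libnuma and at least %d NUMA nodes\n", NODES);
        return 1;
    }
    const size_t page = (size_t) sysconf(_SC_PAGE_SIZE);
    const size_t size = 64 * page * NODES;   /* page-aligned, splits evenly   */
    void * buf = numa_alloc_onnode(size, 0); /* first place it all on node 0  */
    if (buf == NULL) {
        return 1;
    }
    memset(buf, 0, size); /* fault the pages in so move_pages() can move them */
    int rc = split_pages_across_nodes(buf, size);
    printf("page split %s\n", rc == 0 ? "succeeded" : "failed");
    numa_free(buf, size);
    return rc == 0 ? 0 : 1;
}

Built with something like "cc sketch.c -lnuma", this reproduces the patch's allocate-on-node-0, touch, then migrate-half-the-pages flow; the patch additionally caches migrated ranges and verifies page placement in its debug build.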