From a7a6ce930e6ab2ca2f917356fb57e7ebc772d10e Mon Sep 17 00:00:00 2001
From: bolt
Date: Tue, 20 May 2025 06:57:09 +0000
Subject: [PATCH 1/8] add GGML_USE_NUMA_MIGRATE feature to optimize cross NUMA
 op computation

---
 common/arg.cpp                          |   6 +
 ggml/CMakeLists.txt                     |   5 +
 ggml/include/ggml-backend.h             |   3 +
 ggml/include/ggml-cpu.h                 |   3 +
 ggml/src/CMakeLists.txt                 |  24 ++
 ggml/src/ggml-backend.cpp               | 312 ++++++++++++++++++++++++
 ggml/src/ggml-cpu/amx/amx.cpp           |   4 +
 ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp  |   8 +-
 ggml/src/ggml-cpu/ggml-cpu-impl.h       |   2 +
 ggml/src/ggml-cpu/ggml-cpu.c            | 182 +++++++++++++-
 ggml/src/ggml-cpu/kleidiai/kleidiai.cpp |   4 +
 src/llama-model-loader.cpp              |   4 +
 tools/llama-bench/llama-bench.cpp       |   9 +
 13 files changed, 561 insertions(+), 5 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index e2676bb878e28..e4592bfb847bf 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2276,12 +2276,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "- distribute: spread execution evenly over all nodes\n"
             "- isolate: only spawn threads on CPUs on the node that execution started on\n"
             "- numactl: use the CPU map provided by numactl\n"
+#ifdef GGML_USE_NUMA_MIGRATE
+            "- migrate: for affinity threads with page migration across NUMA nodes\n"
+#endif
             "if run without this previously, it is recommended to drop the system page cache before using this\n"
             "see https://github.com/ggml-org/llama.cpp/issues/1437",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+#ifdef GGML_USE_NUMA_MIGRATE
+            else if (value == "migrate") { params.numa = GGML_NUMA_STRATEGY_MIGRATE; }
+#endif
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 4746d5cb76c08..dd2d91ed86799 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -151,6 +151,11 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
                                             "ggml: BLAS library vendor")
 option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"                     ${GGML_LLAMAFILE_DEFAULT})
 
+option(GGML_NUMA_MIGRATE                    "ggml: use NUMA_MIGRATE"                  OFF)
+set(GGML_NUMA_MIGRATE_NODES "2" CACHE STRING
+                                            "ggml: the number of NUMA nodes during page migration")
+option(GGML_NUMA_MIGRATE_DEBUG              "ggml: enable debugging of NUMA_MIGRATE"  OFF)
+
 option(GGML_CUDA                            "ggml: use CUDA"                          OFF)
 option(GGML_MUSA                            "ggml: use MUSA"                          OFF)
 option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS" OFF)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 778927f68217a..6b02d5f24a54a 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -348,6 +348,9 @@ extern "C" {
     // CPU buffer types are always available
     GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+#ifdef GGML_USE_NUMA_MIGRATE
+    GGML_API size_t ggml_backend_get_page_size(void);
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index de77a875ec533..54c24ec537cac 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -28,6 +28,9 @@ extern "C" {
         GGML_NUMA_STRATEGY_ISOLATE  = 2,
         GGML_NUMA_STRATEGY_NUMACTL  = 3,
         GGML_NUMA_STRATEGY_MIRROR   = 4,
+#ifdef GGML_USE_NUMA_MIGRATE
+
GGML_NUMA_STRATEGY_MIGRATE = 5, +#endif GGML_NUMA_STRATEGY_COUNT }; diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ddea5ad3891e5..6da09ff842d8b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -343,3 +343,27 @@ if (BUILD_SHARED_LIBS) target_compile_definitions(${target} PUBLIC GGML_SHARED) endforeach() endif() + +if (GGML_NUMA_MIGRATE) + find_path(NUMA_ROOT_DIR + NAMES include/numa.h + PATHS ENV NUMA_ROOT + DOC "NUMA root directory") + + find_library(NUMA_LIBRARY + NAMES numa + HINTS ${NUMA_ROOT_DIR} + DOC "NUMA library") + + if (NOT NUMA_LIBRARY) + message(FATAL_ERROR "Could NOT find NUMA library.") + endif() + + if (GGML_NUMA_MIGRATE_DEBUG) + target_compile_definitions(ggml-base PUBLIC GGML_USE_NUMA_MIGRATE GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES} GGML_USE_NUMA_MIGRATE_DEBUG) + else() + target_compile_definitions(ggml-base PUBLIC GGML_USE_NUMA_MIGRATE GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES}) + endif() + + target_link_libraries(ggml-base PRIVATE ${NUMA_LIBRARY}) +endif() diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b30b4cb386f9f..fd31530becf2c 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -22,12 +22,51 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #ifdef __APPLE__ #include #include #endif +#ifdef GGML_USE_NUMA_MIGRATE +class numa_migrate_mapping_cache { +public: + void * addr; + int size; + numa_migrate_mapping_cache(void *addr, int size): addr(addr), size(size) { } + + bool operator<(const numa_migrate_mapping_cache& other) const { + if (addr != other.addr) { + return addr < other.addr; + } else { + return size < other.size; + } + } + + bool operator==(const numa_migrate_mapping_cache& other) const { + return (addr == other.addr && size == other.size); + } + +}; + +static std::set ggml_mapping_cache; +static size_t ggml_backend_page_size = 0; +static std::mutex ggml_mapping_mutex; +#endif // backend buffer type @@ -1658,6 +1697,244 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) { return ggml_backend_buffer_init_tensor(tensor->buffer, tensor); } +#ifdef GGML_USE_NUMA_MIGRATE +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG +static int check_numa_pages_migration(void *addr, size_t total_size) { + if (total_size % ggml_backend_page_size != 0) { + return -1; + } + + size_t offset = 0; // Offset in bytes from the start of the allocated memory + int num_nodes = GGML_NUMA_MIGRATE_NODES; + + for (int i = 0; i < num_nodes; ++i) { + int target_node = i; + size_t size_to_migrate = total_size / num_nodes; + + if (size_to_migrate > total_size - offset) { + GGML_LOG_ERROR( + "Error: Size to migrate to node %d exceeds remaining memory, " + "size_to_migrate: %ld, total: %ld\n", + target_node, size_to_migrate, total_size); + return -1; + } + + size_t num_pages_to_migrate = size_to_migrate / ggml_backend_page_size; + if (size_to_migrate % ggml_backend_page_size != 0) { + GGML_LOG_WARN("Warning: Size to migrate to node %ld is not a " + "multiple of page size, total: %ld size_to_migrate: " + "%ld, ggml_backend_page_size: %ld.\n", + target_node, total_size, size_to_migrate, + ggml_backend_page_size); + return -1; + } + + if (num_pages_to_migrate == 0) { + GGML_LOG_WARN("Warning: No pages to migrate to node %d.\n", + target_node); + continue; + } + + void *migrate_start_addr = (char *)addr + (i)*size_to_migrate; + + int *status = (int 
*)malloc(num_pages_to_migrate * sizeof(int)); + if (!status) { + GGML_LOG_ERROR("malloc for status failed"); + return -1; + } + memset(status, 0, num_pages_to_migrate * sizeof(int)); + + int *nodes = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!nodes) { + GGML_LOG_ERROR("malloc for nodes failed"); + return -1; + } + memset(nodes, 0, num_pages_to_migrate * sizeof(int)); + + void **addr_to_migrate = + (void **)malloc(num_pages_to_migrate * sizeof(void *)); + for (size_t j = 0; j < num_pages_to_migrate; j++) { + status[j] = 0; + nodes[j] = i; + addr_to_migrate[j] = (void *)((char *)migrate_start_addr + + j * ggml_backend_page_size); + } + + // check if pages are migrated + int ret = move_pages(0, num_pages_to_migrate, addr_to_migrate, NULL, + status, MPOL_MF_MOVE); + if (ret < 0) { + GGML_LOG_ERROR("check pages failed"); + free(status); + free(nodes); + return -1; + } + + for (size_t j = 0; j < num_pages_to_migrate; ++j) { + if (status[j] != target_node) { + GGML_LOG_WARN("Warning: Page %zu migration status to node %d: " + "%d, ret: %d, addr: %p\n", + j, target_node, status[j], ret, + addr_to_migrate[j]); + if (status[j] == -ENODEV) { + GGML_LOG_ERROR( + " - Error: No such device (NUMA node problem)\n"); + } else if (status[j] == -EPERM) { + GGML_LOG_ERROR( + " - Error: Operation not permitted (permissions)\n"); + } else if (status[j] == -ENOENT) { + GGML_LOG_ERROR(" - Error: ENOENT\n"); + } else if (status[j] == -EFAULT) { + GGML_LOG_ERROR(" - Error: Bad address\n"); + } else if (status[j] == -EINVAL) { + GGML_LOG_ERROR(" - Error: Invalid argument\n"); + } else if (status[j] == -ENOMEM) { + GGML_LOG_ERROR(" - Error: Out of memory\n"); + } else if (status[j] == -EACCES) { + GGML_LOG_ERROR(" - Error: access\n"); + } else if (status[j] == -ESRCH) { + GGML_LOG_ERROR(" - Error: access\n"); + } else { + GGML_LOG_ERROR(" - Error: Unknown status code at j: %ld: " + "%d, total_size: %ld\n", + j, status[j], total_size); + } + + exit(0); + return -1; + } + } + + free(status); + free(nodes); + free(addr_to_migrate); + + offset += size_to_migrate; + } + + GGML_LOG_INFO( + "page migration check passed at %p, size: %ld, num nodes: %d\n", addr, + total_size, num_nodes); + return 0; +} +#endif + +// Function to migrate pages to multiple NUMA nodes. +static int migrate_pages_multiple_nodes(void *addr, size_t total_size) { + + if (total_size % ggml_backend_page_size != 0) { + GGML_LOG_WARN("Warning: Total size is not a multiple of page size. 
" + "Some memory may not be migrated.\n"); + return -1; + } + + size_t offset = 0; // Offset in bytes from the start of the allocated memory + int num_nodes = GGML_NUMA_MIGRATE_NODES; + + for (int i = 0; i < num_nodes; ++i) { + int target_node = i; + size_t size_to_migrate = total_size / num_nodes; + + if (size_to_migrate > total_size - offset) { + GGML_LOG_ERROR( + "Error: Size to migrate to node %d exceeds remaining memory, " + "size_to_migrate: %ld, total: %ld\n", + target_node, size_to_migrate, total_size); + return -1; + } + + size_t num_pages_to_migrate = size_to_migrate / ggml_backend_page_size; + if (size_to_migrate % ggml_backend_page_size != 0) { +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + GGML_LOG_WARN("Warning: Size to migrate to node %ld is not a " + "multiple of page size, total: %ld size_to_migrate: " + "%ld, ggml_backend_page_size: %ld.\n", + target_node, total_size, size_to_migrate, + ggml_backend_page_size); +#endif + return -1; + } + + if (num_pages_to_migrate == 0) { + GGML_LOG_WARN("Warning: No pages to migrate to node %d.\n", + target_node); + continue; + } + + void *migrate_start_addr = (char *)addr + (i)*size_to_migrate; + + int *status = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!status) { + GGML_LOG_ERROR("malloc for status failed"); + return -1; + } + memset(status, 0, num_pages_to_migrate * sizeof(int)); + + int *nodes = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!nodes) { + GGML_LOG_ERROR("malloc for nodes failed"); + return -1; + } + memset(nodes, 0, num_pages_to_migrate * sizeof(int)); + + void **addr_to_migrate = + (void **)malloc(num_pages_to_migrate * sizeof(void *)); + for (size_t j = 0; j < num_pages_to_migrate; j++) { + status[j] = 0; + nodes[j] = i; + addr_to_migrate[j] = (void *)((char *)migrate_start_addr + + j * ggml_backend_page_size); + } + + int ret = move_pages(0, num_pages_to_migrate, addr_to_migrate, nodes, + status, MPOL_MF_MOVE); + if (ret < 0) { + GGML_LOG_ERROR("move_pages failed"); + free(status); + free(nodes); + return -1; + } + + free(status); + free(nodes); + free(addr_to_migrate); + + offset += size_to_migrate; + } + + return 0; +} + +static void migrate_pages_with_cache(void *addr, size_t size, + bool force_memset) { + if (size >= GGML_NUMA_MIGRATE_NODES * ggml_backend_page_size) { + numa_migrate_mapping_cache current_addr(addr, size); + std::lock_guard lock(ggml_mapping_mutex); + auto it = ggml_mapping_cache.find(current_addr); + if (it == ggml_mapping_cache.end()) { + GGML_ASSERT(((uint64_t)(addr) & (ggml_backend_page_size - 1)) == 0); + int num_pages = + size / ggml_backend_page_size / GGML_NUMA_MIGRATE_NODES; + if (num_pages && ((size % ggml_backend_page_size) == 0)) { + if (force_memset) { + memset(addr, 0, size); // force to allocate memory + } + if (migrate_pages_multiple_nodes(addr, size) != 0) { + GGML_LOG_DEBUG("Migration to multiple nodes failed, addr: " + "%p, size: %ld\n", + addr, size); + } else { +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + check_numa_pages_migration(addr, size); +#endif + } + ggml_mapping_cache.insert(current_addr); + } + } + } +} +#endif + enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->data == NULL); @@ -1668,6 +1945,11 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct tensor->buffer = buffer; tensor->data = addr; + +#ifdef GGML_USE_NUMA_MIGRATE + size_t size = ggml_backend_buffer_get_alloc_size(buffer, tensor); + 
migrate_pages_with_cache(tensor->data, size, true); +#endif return ggml_backend_buffer_init_tensor(buffer, tensor); } @@ -1861,16 +2143,28 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { uintptr_t data = (uintptr_t)buffer->context; +#ifdef GGML_USE_NUMA_MIGRATE + // align the buffer + if (data % ggml_backend_page_size != 0) { + data = GGML_PAD(data, ggml_backend_page_size); + } +#else // align the buffer if (data % TENSOR_ALIGNMENT != 0) { data = GGML_PAD(data, TENSOR_ALIGNMENT); } +#endif return (void *)data; } static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { +#ifdef GGML_USE_NUMA_MIGRATE + numa_free(buffer->context, buffer->size); +#else ggml_aligned_free(buffer->context, buffer->size); +#endif + } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -1939,8 +2233,22 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty GGML_UNUSED(buft); } +#ifdef GGML_USE_NUMA_MIGRATE +size_t ggml_backend_get_page_size(void) { + if (ggml_backend_page_size == 0) { + ggml_backend_page_size = sysconf(_SC_PAGE_SIZE); + } + return ggml_backend_page_size; +} +#endif + static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +#ifdef GGML_USE_NUMA_MIGRATE + ggml_backend_get_page_size(); + void * data = numa_alloc_onnode(size, 0); +#else void * data = ggml_aligned_malloc(size); +#endif if (data == NULL) { GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); @@ -1951,7 +2259,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back } static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp index 0f067137df006..222d8095cae74 100644 --- a/ggml/src/ggml-cpu/amx/amx.cpp +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -133,7 +133,11 @@ static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_back } static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 8ff6d64a4d0d1..e5c1ccc2de9a0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -6111,7 +6111,7 @@ template data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); } - ggml_barrier(params->threadpool); + ggml_barrier_numa_aware(params->threadpool, ith, GGML_BARRIER_NODE_LAST); const void * src1_wdata = params->wdata; const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10); @@ -6219,7 +6219,7 @@ template threadpool); + ggml_barrier_numa_aware(params->threadpool, ith, GGML_BARRIER_NODE_LAST); // compute each matrix multiplication in sequence for (int cur_a = 0; cur_a < n_as; ++cur_a) { @@ -6359,7 +6359,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(g } static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + 
return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index e4af07635c157..486f11a2b582a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -506,6 +506,8 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) { // TODO: move to ggml-threading void ggml_barrier(struct ggml_threadpool * tp); +#define GGML_BARRIER_NODE_LAST -1 +void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n); #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 133b50606bcd1..e2e552a6a0681 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -34,6 +34,7 @@ #include #include #include +#include #if defined(__gnu_linux__) #include #endif @@ -440,6 +441,13 @@ struct ggml_threadpool { atomic_int n_graph; // incremented when there is work to be done (i.e each graph) atomic_int GGML_CACHE_ALIGN n_barrier; atomic_int GGML_CACHE_ALIGN n_barrier_passed; + +#ifdef GGML_USE_NUMA_MIGRATE + atomic_int GGML_CACHE_ALIGN *n_barrier_node[GGML_NUMA_MIGRATE_NODES]; + atomic_int GGML_CACHE_ALIGN *n_barrier_passed_node[GGML_NUMA_MIGRATE_NODES]; + atomic_int GGML_CACHE_ALIGN *n_barrier_passed_last; +#endif + atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. // these are atomic as an annotation for thread-sanitizer @@ -505,6 +513,10 @@ struct ggml_numa_nodes { #else uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype #endif + +#ifdef GGML_USE_NUMA_MIGRATE + bool even_distributed; +#endif }; // @@ -555,6 +567,74 @@ void ggml_barrier(struct ggml_threadpool * tp) { #endif } +#ifdef GGML_USE_NUMA_MIGRATE +static int get_node_from_cpu(int cpu, int cores_per_numa) { + return cpu / cores_per_numa; +} +#endif + +void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { +#ifndef GGML_USE_NUMA_MIGRATE + UNUSED(ith); + UNUSED(node_n); + ggml_barrier(tp); + return; +#else + if ((g_state.numa.numa_strategy != GGML_NUMA_STRATEGY_MIGRATE) || !g_state.numa.even_distributed) { + ggml_barrier(tp); + return; + } + + int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); + if (n_threads == 1) { + return; + } + + int cores_per_numa = g_state.numa.nodes[0].n_cpus; + int numa_nodes = n_threads / cores_per_numa; + int remaining_cores = n_threads % cores_per_numa; + if (numa_nodes <= 1 || remaining_cores) { + ggml_barrier(tp); + return; + } + + if (node_n == GGML_BARRIER_NODE_LAST) { + node_n = tp->cgraph->n_nodes; + } + + int node = get_node_from_cpu(ith, cores_per_numa); + + int n_passed = atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_relaxed); + + // enter barrier (full seq-cst fence) + int n_barrier = atomic_fetch_add_explicit(tp->n_barrier_node[node], 1, memory_order_seq_cst); + + if (n_barrier == (cores_per_numa - 1)) { + // last thread of current numa node + atomic_store_explicit(tp->n_barrier_node[node], 0, memory_order_seq_cst); + + int n_passed_node = atomic_fetch_add_explicit(&tp->n_barrier_passed_last[node_n], 1, memory_order_seq_cst); + + if (n_passed_node == (numa_nodes - 1)) { // last numa node cpu + atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); + atomic_store_explicit(&tp->n_barrier_passed_last[node_n], 0, memory_order_seq_cst); + } else { + while 
(atomic_load_explicit(&tp->n_barrier_passed_last[node_n], memory_order_relaxed)) { + ggml_thread_cpu_relax(); + } + atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); + } + + return; + } + + // wait for other threads + while (atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_seq_cst) == n_passed) { + ggml_thread_cpu_relax(); + } +#endif +} + #if defined(__gnu_linux__) static cpu_set_t ggml_get_numa_affinity(void) { cpu_set_t cpuset; @@ -631,6 +711,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { struct ggml_numa_node * node = &g_state.numa.nodes[n]; GGML_PRINT_DEBUG("CPUs on node %u:", n); node->n_cpus = 0; + +#ifdef GGML_USE_NUMA_MIGRATE + g_state.numa.even_distributed = true; +#endif for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); @@ -640,6 +724,11 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { } } GGML_PRINT_DEBUG("\n"); +#ifdef GGML_USE_NUMA_MIGRATE + if ((n != 0) && (g_state.numa.nodes[n].n_cpus != g_state.numa.nodes[0].n_cpus)) { + g_state.numa.even_distributed = false; + } +#endif } if (ggml_is_numa()) { @@ -2070,6 +2159,30 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) + +#ifdef GGML_USE_NUMA_MIGRATE +static void set_numa_migrate_affinity(int core_no) { + // Check if the core number is valid + if (core_no < 0 || core_no >= (int)g_state.numa.total_cpus) { + printf("%s, Warn: core_no not between 0 and %d, failback.\n", __func__, g_state.numa.total_cpus); + return; + } + + cpu_set_t cpuset; + CPU_ZERO(&cpuset); // Initialize the CPU set + + CPU_SET(core_no, &cpuset); // Set the specified core + + // Set the thread's CPU affinity + int result = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + if (result != 0) { + printf("failed to set core_no affinity: %d\n", core_no); + perror("set_affinity"); + exit (1); + } +} +#endif + static void set_numa_thread_affinity(int thread_n) { if (!ggml_is_numa()) { return; @@ -2095,6 +2208,11 @@ static void set_numa_thread_affinity(int thread_n) { fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv)); } return; +#ifdef GGML_USE_NUMA_MIGRATE + case GGML_NUMA_STRATEGY_MIGRATE: + set_numa_migrate_affinity(thread_n); + return; +#endif default: return; } @@ -2840,11 +2958,37 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.threadpool=*/ tp, }; +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + struct timespec t0, t1, t2, t3, t4; + long d12, d32, d43; + bool log_time = true; + int log_node_n = 0; + if (log_time) { + clock_gettime(CLOCK_MONOTONIC, &t0); + } +#endif + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; - +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + if ((node->op == GGML_OP_MUL_MAT)) { + log_node_n = node_n; + log_time = true; + } else { + log_time = false; + } + if (log_time) { + clock_gettime(CLOCK_MONOTONIC, &t1); + } +#endif ggml_compute_forward(¶ms, node); +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + if (log_time) { + clock_gettime(CLOCK_MONOTONIC, &t2); + } +#endif + if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { atomic_store_explicit(&tp->abort, node_n + 1, 
memory_order_relaxed); @@ -2852,11 +2996,28 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } if (node_n + 1 < cgraph->n_nodes) { - ggml_barrier(state->threadpool); +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + if (log_time) { + clock_gettime(CLOCK_MONOTONIC, &t3); + } +#endif + + ggml_barrier_numa_aware(state->threadpool, state->ith, node_n); + +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + if (log_time) { + clock_gettime(CLOCK_MONOTONIC, &t4); + d12 = (t2.tv_sec - t1.tv_sec) * 1e9 + (t2.tv_nsec - t1.tv_nsec); + d32 = (t3.tv_sec - t2.tv_sec) * 1e9 + (t3.tv_nsec - t2.tv_nsec); + d43 = (t4.tv_sec - t3.tv_sec) * 1e9 + (t4.tv_nsec - t3.tv_nsec); + printf("%s, op: %d, ith: %d, cpu: %d, d12: %ld, d32: %ld, d43: %ld, t1: %ld, t2: %ld, t3: %ld, t4: %ld\n", \ + __func__, node->op, state->ith, sched_getcpu(), d12, d32, d43, t1.tv_nsec, t2.tv_nsec, t3.tv_nsec, t4.tv_nsec); + } +#endif } } - ggml_barrier(state->threadpool); + ggml_barrier_numa_aware(state->threadpool, state->ith, GGML_BARRIER_NODE_LAST); return 0; } @@ -3021,6 +3182,21 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( threadpool->n_graph = 0; threadpool->n_barrier = 0; threadpool->n_barrier_passed = 0; + +#ifdef GGML_USE_NUMA_MIGRATE + for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) { + threadpool->n_barrier_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node); + *threadpool->n_barrier_node[node] = 0; + threadpool->n_barrier_passed_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node); + *threadpool->n_barrier_passed_node[node] = 0; + } + + threadpool->n_barrier_passed_last = (atomic_int *)malloc((threadpool->cgraph->n_nodes + 1) * sizeof(atomic_int)); + for (int i = 0; i < threadpool->cgraph->n_nodes + 1; i++) { + threadpool->n_barrier_passed_last[i] = 0; + } +#endif + threadpool->current_chunk = 0; threadpool->stop = false; threadpool->pause = tpp->paused; diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 15f0cd1540686..27f3d1130e4e6 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -413,7 +413,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer( } static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ddb1b03675b28..e32700f2fc9b4 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -686,6 +686,10 @@ llama_model_loader::llama_model_loader( use_mmap = false; } +#ifdef GGML_USE_NUMA_MIGRATE + use_mmap = false; +#endif + this->use_mmap = use_mmap; this->check_tensors = check_tensors; } diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 06196cf24fc89..81dbaeac4cf59 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -312,7 +312,12 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf("options:\n"); printf(" -h, --help\n"); +#ifdef GGML_USE_NUMA_MIGRATE + printf(" --numa \n"); + printf(" numa mode (default: disabled)\n"); +#else printf(" --numa numa mode (default: disabled)\n"); +#endif printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n", @@ -628,6 +633,10 
@@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; +#ifdef GGML_USE_NUMA_MIGRATE + } else if (value == "migrate") { + params.numa = GGML_NUMA_STRATEGY_MIGRATE; +#endif } else { invalid_param = true; break; From 23fe7d38d2c591ea2f7f554bfbb41e23f09cfbb5 Mon Sep 17 00:00:00 2001 From: bolt Date: Tue, 20 May 2025 10:04:52 +0000 Subject: [PATCH 2/8] fix the threads number larger than numa node numbers --- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e2e552a6a0681..807ae1b15fad3 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -593,7 +593,7 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { int cores_per_numa = g_state.numa.nodes[0].n_cpus; int numa_nodes = n_threads / cores_per_numa; int remaining_cores = n_threads % cores_per_numa; - if (numa_nodes <= 1 || remaining_cores) { + if ((numa_nodes != GGML_NUMA_MIGRATE_NODES) || remaining_cores) { ggml_barrier(tp); return; } From e5cb47d8073dd44e77dac2b54afd35ebf449220e Mon Sep 17 00:00:00 2001 From: bolt Date: Wed, 21 May 2025 02:56:50 +0000 Subject: [PATCH 3/8] fix the buffer allocate size for NUMA page migration --- ggml/src/ggml-alloc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 5fd379f6a9461..5d0d1331a38af 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -948,6 +948,22 @@ static bool alloc_tensor_range(struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t size, ggml_backend_buffer_t ** buffers, size_t * n_buffers) { +#ifdef GGML_USE_NUMA_MIGRATE + size_t num_of_tensors = 0; + for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { + if (t->data == NULL) { + if (t->view_src == NULL) { + num_of_tensors++; + } + } + } + size_t ps = ggml_backend_get_page_size(); + size_t original_size = size; + size += ps * num_of_tensors; + GGML_LOG_DEBUG("alloc buffer for NUMA page migration, num of tensors: %ld, size increased from %ld to %ld, increased %ld MiB\n", + num_of_tensors, original_size, size, (size - original_size) / 1024 / 1024); +#endif + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); if (buffer == NULL) { GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size); From ab80f55a5a29738a1370fdf67ab5251cf53b6745 Mon Sep 17 00:00:00 2001 From: bolt Date: Wed, 21 May 2025 04:33:30 +0000 Subject: [PATCH 4/8] fix the cgraph null issue when running with llama-bench --- ggml/src/ggml-cpu/ggml-cpu-impl.h | 6 +++++- ggml/src/ggml-cpu/ggml-cpu.c | 10 +++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 486f11a2b582a..6f7cb928554c1 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -506,7 +506,11 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) { // TODO: move to ggml-threading void ggml_barrier(struct ggml_threadpool * tp); -#define GGML_BARRIER_NODE_LAST -1 +enum ggml_barrier_node_index { + GGML_BARRIER_NODE_PING = 0, + GGML_BARRIER_NODE_PONG = 1, + GGML_BARRIER_NODE_LAST = 2, +}; void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n); #ifdef __cplusplus diff --git 
a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 807ae1b15fad3..49c9567480770 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -598,10 +598,6 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { return; } - if (node_n == GGML_BARRIER_NODE_LAST) { - node_n = tp->cgraph->n_nodes; - } - int node = get_node_from_cpu(ith, cores_per_numa); int n_passed = atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_relaxed); @@ -3002,7 +2998,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } #endif - ggml_barrier_numa_aware(state->threadpool, state->ith, node_n); + ggml_barrier_numa_aware(state->threadpool, state->ith, node_n % GGML_BARRIER_NODE_LAST); #ifdef GGML_USE_NUMA_MIGRATE_DEBUG if (log_time) { @@ -3191,8 +3187,8 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( *threadpool->n_barrier_passed_node[node] = 0; } - threadpool->n_barrier_passed_last = (atomic_int *)malloc((threadpool->cgraph->n_nodes + 1) * sizeof(atomic_int)); - for (int i = 0; i < threadpool->cgraph->n_nodes + 1; i++) { + threadpool->n_barrier_passed_last = (atomic_int *)malloc(GGML_BARRIER_NODE_LAST * sizeof(atomic_int)); + for (int i = 0; i < GGML_BARRIER_NODE_LAST; i++) { threadpool->n_barrier_passed_last[i] = 0; } #endif From 3e3a878a3180df0a8d7a97182557e53cf3da2d4b Mon Sep 17 00:00:00 2001 From: bolt Date: Wed, 21 May 2025 08:17:07 +0000 Subject: [PATCH 5/8] use ggml_barrier instead of ggml_barrier_numa_aware for src1 data barrier as the barrier only happens in few cores --- ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index e5c1ccc2de9a0..3fcfa9e89a6bf 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -6111,7 +6111,7 @@ template data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); } - ggml_barrier_numa_aware(params->threadpool, ith, GGML_BARRIER_NODE_LAST); + ggml_barrier(params->threadpool); const void * src1_wdata = params->wdata; const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10); @@ -6219,7 +6219,7 @@ template threadpool, ith, GGML_BARRIER_NODE_LAST); + ggml_barrier(params->threadpool); // compute each matrix multiplication in sequence for (int cur_a = 0; cur_a < n_as; ++cur_a) { From ed4d9873b41ae6576622ab1fb990f8f74fff3f0a Mon Sep 17 00:00:00 2001 From: bolt Date: Thu, 22 May 2025 09:27:57 +0000 Subject: [PATCH 6/8] remove debug code for ggml barrier --- ggml/src/ggml-cpu/ggml-cpu.c | 44 ------------------------------------ 1 file changed, 44 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 49c9567480770..b89a674b15892 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2954,37 +2954,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.threadpool=*/ tp, }; -#ifdef GGML_USE_NUMA_MIGRATE_DEBUG - struct timespec t0, t1, t2, t3, t4; - long d12, d32, d43; - bool log_time = true; - int log_node_n = 0; - if (log_time) { - clock_gettime(CLOCK_MONOTONIC, &t0); - } -#endif - for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; -#ifdef GGML_USE_NUMA_MIGRATE_DEBUG - if ((node->op == GGML_OP_MUL_MAT)) { - log_node_n = node_n; - log_time = true; - } else { - log_time = false; - } - if (log_time) { 
- clock_gettime(CLOCK_MONOTONIC, &t1); - } -#endif ggml_compute_forward(¶ms, node); -#ifdef GGML_USE_NUMA_MIGRATE_DEBUG - if (log_time) { - clock_gettime(CLOCK_MONOTONIC, &t2); - } -#endif - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); @@ -2992,24 +2965,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } if (node_n + 1 < cgraph->n_nodes) { -#ifdef GGML_USE_NUMA_MIGRATE_DEBUG - if (log_time) { - clock_gettime(CLOCK_MONOTONIC, &t3); - } -#endif - ggml_barrier_numa_aware(state->threadpool, state->ith, node_n % GGML_BARRIER_NODE_LAST); - -#ifdef GGML_USE_NUMA_MIGRATE_DEBUG - if (log_time) { - clock_gettime(CLOCK_MONOTONIC, &t4); - d12 = (t2.tv_sec - t1.tv_sec) * 1e9 + (t2.tv_nsec - t1.tv_nsec); - d32 = (t3.tv_sec - t2.tv_sec) * 1e9 + (t3.tv_nsec - t2.tv_nsec); - d43 = (t4.tv_sec - t3.tv_sec) * 1e9 + (t4.tv_nsec - t3.tv_nsec); - printf("%s, op: %d, ith: %d, cpu: %d, d12: %ld, d32: %ld, d43: %ld, t1: %ld, t2: %ld, t3: %ld, t4: %ld\n", \ - __func__, node->op, state->ith, sched_getcpu(), d12, d32, d43, t1.tv_nsec, t2.tv_nsec, t3.tv_nsec, t4.tv_nsec); - } -#endif } } From e9c93714371c994ccb2db6a18883faf5f5c1ccc3 Mon Sep 17 00:00:00 2001 From: bolt Date: Tue, 3 Jun 2025 17:48:51 +0800 Subject: [PATCH 7/8] opt the src1 tensor data with local numa data, 33% uplift for tg128 --- ggml/include/ggml-cpu.h | 3 +++ ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 30 +++++++++++++++++++++----- ggml/src/ggml-cpu/ggml-cpu-impl.h | 6 ++++++ ggml/src/ggml-cpu/ggml-cpu.c | 18 ++++++++++++++++ ggml/src/ggml-cpu/ggml-cpu.cpp | 29 +++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 5 deletions(-) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 54c24ec537cac..79f93910e0d88 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -12,6 +12,9 @@ extern "C" { struct ggml_cplan { size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` +#ifdef GGML_USE_NUMA_MIGRATE + uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES]; +#endif int n_threads; struct ggml_threadpool * threadpool; diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 3fcfa9e89a6bf..586116ba2c9b9 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -15,6 +15,7 @@ #include #include // for qsort #include // for GGML_ASSERT +#include #include "ggml-cpu-aarch64.h" @@ -6094,7 +6095,12 @@ template src[0]) == 2); // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2); +#ifdef GGML_USE_NUMA_MIGRATE + int node_id = numa_node_of_cpu(ith); + char * wdata = static_cast(params->wdata_numa[node_id]); +#else char * wdata = static_cast(params->wdata); +#endif const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); assert(params->wsize >= nbw1 * ne11); @@ -6102,18 +6108,32 @@ template from_float; int64_t i11_processed = 0; - for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { +#ifdef GGML_USE_NUMA_MIGRATE + int round_cnts = ggml_cores_per_numa(); + int start_id = ith - round_cnts * node_id; + if (round_cnts == 0) { + round_cnts = nth; + start_id = ith; + } +#else + int round_cnts = nth; + int start_id = ith; +#endif + for (int64_t i11 = start_id * 4; i11 < ne11 - ne11 % 4; i11 += round_cnts * 4) { ggml_quantize_mat_t((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * 
nbw1), 4, ne10); } i11_processed = ne11 - ne11 % 4; - for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { + for (int64_t i11 = i11_processed + start_id; i11 < ne11; i11 += round_cnts) { from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); } +#ifdef GGML_USE_NUMA_MIGRATE + ggml_barrier_numa_aware(params->threadpool, ith, GGML_BARRIER_NODE_LAST); +#else ggml_barrier(params->threadpool); +#endif - const void * src1_wdata = params->wdata; const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10); int64_t src0_start = (ith * ne01) / nth; int64_t src0_end = ((ith + 1) * ne01) / nth; @@ -6128,13 +6148,13 @@ template (ne00, (float *) ((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); + (const char *) wdata, ne11 - ne11 % 4, src0_end - src0_start); } for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) { gemv(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata + (src1_col_stride * iter), 1, + (const char *) wdata + (src1_col_stride * iter), 1, src0_end - src0_start); } } diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 6f7cb928554c1..a3c633a6c979a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -22,6 +22,9 @@ struct ggml_compute_params { // work buffer for all threads size_t wsize; void * wdata; +#ifdef GGML_USE_NUMA_MIGRATE + void * wdata_numa[GGML_NUMA_MIGRATE_NODES]; +#endif struct ggml_threadpool * threadpool; }; @@ -512,6 +515,9 @@ enum ggml_barrier_node_index { GGML_BARRIER_NODE_LAST = 2, }; void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n); +#ifdef GGML_USE_NUMA_MIGRATE +int ggml_cores_per_numa(void); +#endif #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b89a674b15892..994d4a56d12af 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -571,6 +571,10 @@ void ggml_barrier(struct ggml_threadpool * tp) { static int get_node_from_cpu(int cpu, int cores_per_numa) { return cpu / cores_per_numa; } + +int ggml_cores_per_numa(void) { + return g_state.numa.nodes[0].n_cpus; +} #endif void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { @@ -2933,6 +2937,11 @@ struct ggml_cplan ggml_graph_plan( cplan.n_threads = MIN(max_tasks, n_threads); cplan.work_size = work_size; cplan.work_data = NULL; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + cplan.work_data_numa[i] = NULL; + } +#endif return cplan; } @@ -2951,9 +2960,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed), /*.wsize =*/ cplan->work_size, /*.wdata =*/ cplan->work_data, +#ifdef GGML_USE_NUMA_MIGRATE + /*.wdata_numa =*/ {NULL, NULL}, +#endif /*.threadpool=*/ tp, }; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + params.wdata_numa[i] = cplan->work_data_numa[numa_node_of_cpu(state->ith)]; + } +#endif + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; ggml_compute_forward(¶ms, node); diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 
e013e8b416222..a8b3b2d26ccc3 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef GGML_USE_CPU_HBM # include "ggml-cpu-hbm.h" @@ -87,6 +88,9 @@ struct ggml_backend_cpu_context { ggml_threadpool_t threadpool; uint8_t * work_data; +#ifdef GGML_USE_NUMA_MIGRATE + uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES]; +#endif size_t work_size; ggml_abort_callback abort_callback; @@ -102,6 +106,11 @@ static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { static void ggml_backend_cpu_free(ggml_backend_t backend) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; delete[] cpu_ctx->work_data; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + numa_free(cpu_ctx->work_data_numa[i], cpu_ctx->work_size); + } +#endif delete cpu_ctx; delete backend; } @@ -162,9 +171,24 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cpu_ctx->work_size = 0; return GGML_STATUS_ALLOC_FAILED; } + +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + cpu_ctx->work_data_numa[i] = (uint8_t *)numa_alloc_onnode(cplan.work_size, i); + if (cpu_ctx->work_data_numa[i] == NULL) { + cpu_ctx->work_size = 0; + return GGML_STATUS_ALLOC_FAILED; + } + } +#endif cpu_ctx->work_size = cplan.work_size; } cplan.work_data = (uint8_t *)cpu_ctx->work_data; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + cplan.work_data_numa[i] = (uint8_t *)(cpu_ctx->work_data_numa[i]); + } +#endif cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; @@ -205,6 +229,11 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->threadpool = NULL; ctx->work_data = NULL; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + ctx->work_data_numa[i] = NULL; + } +#endif ctx->work_size = 0; ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; From 88358235e186d2522e257193887518bcfab83245 Mon Sep 17 00:00:00 2001 From: bolt Date: Thu, 12 Jun 2025 13:40:55 +0800 Subject: [PATCH 8/8] fix the perf regression with numa_node_of_cpu() system call and the n_barrier_passed_last indexing issue --- ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 2 +- ggml/src/ggml-cpu/ggml-cpu-impl.h | 2 ++ ggml/src/ggml-cpu/ggml-cpu.c | 24 ++++++++++++------------ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 586116ba2c9b9..2ffee98171e98 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -6096,7 +6096,7 @@ template src[1]) == 2); #ifdef GGML_USE_NUMA_MIGRATE - int node_id = numa_node_of_cpu(ith); + int node_id = ggml_get_node_from_cpu(ith); char * wdata = static_cast(params->wdata_numa[node_id]); #else char * wdata = static_cast(params->wdata); diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index a3c633a6c979a..e74b73b29133d 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -513,10 +513,12 @@ enum ggml_barrier_node_index { GGML_BARRIER_NODE_PING = 0, GGML_BARRIER_NODE_PONG = 1, GGML_BARRIER_NODE_LAST = 2, + GGML_BARRIER_NODE_CNTS = 3 }; void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n); #ifdef GGML_USE_NUMA_MIGRATE int 
ggml_cores_per_numa(void); +int ggml_get_node_from_cpu(int cpu); #endif #ifdef __cplusplus diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 994d4a56d12af..a76477b8a8b3c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -445,7 +445,7 @@ struct ggml_threadpool { #ifdef GGML_USE_NUMA_MIGRATE atomic_int GGML_CACHE_ALIGN *n_barrier_node[GGML_NUMA_MIGRATE_NODES]; atomic_int GGML_CACHE_ALIGN *n_barrier_passed_node[GGML_NUMA_MIGRATE_NODES]; - atomic_int GGML_CACHE_ALIGN *n_barrier_passed_last; + atomic_int GGML_CACHE_ALIGN *n_barrier_passed_last[GGML_BARRIER_NODE_CNTS]; #endif atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. @@ -568,8 +568,8 @@ void ggml_barrier(struct ggml_threadpool * tp) { } #ifdef GGML_USE_NUMA_MIGRATE -static int get_node_from_cpu(int cpu, int cores_per_numa) { - return cpu / cores_per_numa; +int ggml_get_node_from_cpu(int cpu) { + return cpu / g_state.numa.nodes[0].n_cpus; } int ggml_cores_per_numa(void) { @@ -594,7 +594,7 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { return; } - int cores_per_numa = g_state.numa.nodes[0].n_cpus; + int cores_per_numa = ggml_cores_per_numa(); int numa_nodes = n_threads / cores_per_numa; int remaining_cores = n_threads % cores_per_numa; if ((numa_nodes != GGML_NUMA_MIGRATE_NODES) || remaining_cores) { @@ -602,7 +602,7 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { return; } - int node = get_node_from_cpu(ith, cores_per_numa); + int node = ggml_get_node_from_cpu(ith); int n_passed = atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_relaxed); @@ -613,13 +613,13 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { // last thread of current numa node atomic_store_explicit(tp->n_barrier_node[node], 0, memory_order_seq_cst); - int n_passed_node = atomic_fetch_add_explicit(&tp->n_barrier_passed_last[node_n], 1, memory_order_seq_cst); + int n_passed_node = atomic_fetch_add_explicit(tp->n_barrier_passed_last[node_n], 1, memory_order_seq_cst); if (n_passed_node == (numa_nodes - 1)) { // last numa node cpu atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); - atomic_store_explicit(&tp->n_barrier_passed_last[node_n], 0, memory_order_seq_cst); + atomic_store_explicit(tp->n_barrier_passed_last[node_n], 0, memory_order_seq_cst); } else { - while (atomic_load_explicit(&tp->n_barrier_passed_last[node_n], memory_order_relaxed)) { + while (atomic_load_explicit(tp->n_barrier_passed_last[node_n], memory_order_relaxed)) { ggml_thread_cpu_relax(); } atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); @@ -2968,7 +2968,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { #ifdef GGML_USE_NUMA_MIGRATE for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { - params.wdata_numa[i] = cplan->work_data_numa[numa_node_of_cpu(state->ith)]; + params.wdata_numa[i] = cplan->work_data_numa[ggml_get_node_from_cpu(state->ith)]; } #endif @@ -3161,9 +3161,9 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( *threadpool->n_barrier_passed_node[node] = 0; } - threadpool->n_barrier_passed_last = (atomic_int *)malloc(GGML_BARRIER_NODE_LAST * sizeof(atomic_int)); - for (int i = 0; i < GGML_BARRIER_NODE_LAST; i++) { - threadpool->n_barrier_passed_last[i] = 0; + for (int i = 0; i < GGML_BARRIER_NODE_CNTS; i++) { + threadpool->n_barrier_passed_last[i] = 
(atomic_int *)malloc(sizeof(atomic_int));
+        *threadpool->n_barrier_passed_last[i] = 0;
     }
 #endif
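
The following standalone sketch (not part of the patch series) illustrates the mechanism that migrate_pages_multiple_nodes() in patch 1 relies on: a page-aligned, already-faulted buffer is split into contiguous per-node chunks and each chunk's pages are handed to move_pages() with a per-page target node. It is a minimal sketch under stated assumptions — libnuma is installed (link with -lnuma), the machine has at least two NUMA nodes, and the buffer size divides evenly into whole pages per node; the names NODES and split_pages_across_nodes() are illustrative and do not appear in the patch.

/*
 * Minimal sketch: distribute a buffer across NUMA nodes with move_pages().
 * NODES stands in for GGML_NUMA_MIGRATE_NODES; error handling is reduced
 * to what the example needs.
 */
#include <numa.h>
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define NODES 2 /* stand-in for GGML_NUMA_MIGRATE_NODES */

static int split_pages_across_nodes(void * addr, size_t total_size) {
    const size_t page = (size_t) sysconf(_SC_PAGE_SIZE);
    if (total_size % (page * NODES) != 0) {
        return -1; /* only handle sizes that split into whole pages per node */
    }
    const size_t per_node = total_size / NODES;
    const size_t n_pages  = per_node / page;

    void ** pages  = malloc(n_pages * sizeof(void *));
    int *   nodes  = malloc(n_pages * sizeof(int));
    int *   status = malloc(n_pages * sizeof(int));
    if (!pages || !nodes || !status) {
        free(pages); free(nodes); free(status);
        return -1;
    }

    int rc = 0;
    for (int n = 0; n < NODES && rc == 0; ++n) {
        char * base = (char *) addr + (size_t) n * per_node;
        for (size_t i = 0; i < n_pages; ++i) {
            pages[i]  = base + i * page; /* one entry per page in this chunk */
            nodes[i]  = n;               /* target node for the whole chunk  */
            status[i] = 0;
        }
        /* pid 0 = calling process; MPOL_MF_MOVE moves pages it owns */
        if (move_pages(0, n_pages, pages, nodes, status, MPOL_MF_MOVE) < 0) {
            perror("move_pages");
            rc = -1;
        }
    }

    free(pages); free(nodes); free(status);
    return rc;
}

int main(void) {
    if (numa_available() < 0 || numa_max_node() < NODES - 1) {
        fprintf(stderr, "need libnuma and at least %d NUMA nodes\n", NODES);
        return 1;
    }
    const size_t page = (size_t) sysconf(_SC_PAGE_SIZE);
    const size_t size = 64 * page * NODES;   /* page-aligned, splits evenly   */
    void * buf = numa_alloc_onnode(size, 0); /* first place it all on node 0  */
    if (buf == NULL) {
        return 1;
    }
    memset(buf, 0, size); /* fault the pages in so move_pages() can move them */
    int rc = split_pages_across_nodes(buf, size);
    printf("page split %s\n", rc == 0 ? "succeeded" : "failed");
    numa_free(buf, size);
    return rc == 0 ? 0 : 1;
}

Built with something like "cc sketch.c -lnuma", this reproduces the patch's allocate-on-node-0, touch, then migrate-half-the-pages flow; the patch additionally caches migrated ranges and verifies page placement in its debug build.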