Skip to content

Add GGML_USE_NUMA_MIGRATE feature to optimize cross-NUMA op computation #13649

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
6 changes: 6 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2276,12 +2276,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"- distribute: spread execution evenly over all nodes\n"
"- isolate: only spawn threads on CPUs on the node that execution started on\n"
"- numactl: use the CPU map provided by numactl\n"
#ifdef GGML_USE_NUMA_MIGRATE
"- migrate: for affinity threads with page migration across NUMA nodes\n"
#endif
"if run without this previously, it is recommended to drop the system page cache before using this\n"
"see https://github.com/ggml-org/llama.cpp/issues/1437",
[](common_params & params, const std::string & value) {
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
#ifdef GGML_USE_NUMA_MIGRATE
else if (value == "migrate") { params.numa = GGML_NUMA_STRATEGY_MIGRATE; }
#endif
else { throw std::invalid_argument("invalid value"); }
}
).set_env("LLAMA_ARG_NUMA"));
Expand Down
5 changes: 5 additions & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,11 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})

# NUMA page-migration support: OFF by default; when enabled, ggml migrates
# tensor pages across NUMA nodes (see GGML_NUMA_MIGRATE handling in ggml/src).
option(GGML_NUMA_MIGRATE "ggml: use NUMA_MIGRATE" OFF)
# Number of NUMA nodes to spread migrated pages across (string cache var,
# passed through as the GGML_NUMA_MIGRATE_NODES compile definition).
set(GGML_NUMA_MIGRATE_NODES "2" CACHE STRING
"ggml: the number of NUMA nodes during page migration")
# Extra debug logging for the migration path (adds GGML_USE_NUMA_MIGRATE_DEBUG).
option(GGML_NUMA_MIGRATE_DEBUG "ggml: enable debugging of NUMA_MIGRATE" OFF)

option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
Expand Down
3 changes: 3 additions & 0 deletions ggml/include/ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,9 @@ extern "C" {
// CPU buffer types are always available
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_NUMA_MIGRATE
GGML_API size_t ggml_backend_get_page_size(void);
#endif

#ifdef __cplusplus
}
Expand Down
6 changes: 6 additions & 0 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ extern "C" {
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
#ifdef GGML_USE_NUMA_MIGRATE
uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES];
#endif

int n_threads;
struct ggml_threadpool * threadpool;
Expand All @@ -28,6 +31,9 @@ extern "C" {
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
#ifdef GGML_USE_NUMA_MIGRATE
GGML_NUMA_STRATEGY_MIGRATE = 5,
#endif
GGML_NUMA_STRATEGY_COUNT
};

Expand Down
24 changes: 24 additions & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -343,3 +343,27 @@ if (BUILD_SHARED_LIBS)
target_compile_definitions(${target} PUBLIC GGML_SHARED)
endforeach()
endif()

if (GGML_NUMA_MIGRATE)
    # Locate libnuma (Linux). NUMA_ROOT may be set in the environment to point
    # at a non-standard installation prefix containing include/numa.h.
    find_path(NUMA_ROOT_DIR
        NAMES include/numa.h
        PATHS ENV NUMA_ROOT
        DOC "NUMA root directory")

    find_library(NUMA_LIBRARY
        NAMES numa
        HINTS ${NUMA_ROOT_DIR}
        DOC "NUMA library")

    if (NOT NUMA_LIBRARY)
        message(FATAL_ERROR "Could NOT find NUMA library.")
    endif()

    # PUBLIC so that consumers of ggml-base see the same NUMA_MIGRATE
    # configuration (the flag gates struct fields in public headers).
    # The debug define is added conditionally via a genex instead of
    # duplicating the whole call in an if/else.
    target_compile_definitions(ggml-base PUBLIC
        GGML_USE_NUMA_MIGRATE
        GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES}
        $<$<BOOL:${GGML_NUMA_MIGRATE_DEBUG}>:GGML_USE_NUMA_MIGRATE_DEBUG>)

    target_link_libraries(ggml-base PRIVATE ${NUMA_LIBRARY})
endif()
16 changes: 16 additions & 0 deletions ggml/src/ggml-alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -948,6 +948,22 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
ggml_backend_buffer_type_t buft, size_t size,
ggml_backend_buffer_t ** buffers, size_t * n_buffers) {

#ifdef GGML_USE_NUMA_MIGRATE
size_t num_of_tensors = 0;
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
num_of_tensors++;
}
}
}
size_t ps = ggml_backend_get_page_size();
size_t original_size = size;
size += ps * num_of_tensors;
GGML_LOG_DEBUG("alloc buffer for NUMA page migration, num of tensors: %ld, size increased from %ld to %ld, increased %ld MiB\n",
num_of_tensors, original_size, size, (size - original_size) / 1024 / 1024);
#endif

ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
Expand Down
Loading