From 30715409c40bdbce8468d2988990b4bc4dfbdfa4 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Tue, 10 Oct 2023 09:38:39 +0300 Subject: [PATCH 01/20] cmake : don't find CUDA it it's already found --- src/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b225597ed..cd1deaa65 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -196,7 +196,11 @@ endif() if (GGML_CUBLAS) cmake_minimum_required(VERSION 3.17) - find_package(CUDAToolkit) + if (NOT CUDAToolkit_FOUND) + # only find package if not already found by an external source + find_package(CUDAToolkit) + endif() + if (CUDAToolkit_FOUND) message(STATUS "cuBLAS found") From 9f9f46bfaa33f1970594d81de24d9fde92cb7f76 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Tue, 10 Oct 2023 09:40:04 +0300 Subject: [PATCH 02/20] cmake : if built as a plugin, use fPIC and build as static lib --- src/CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cd1deaa65..00e199d91 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -253,7 +253,12 @@ if (GGML_PERF) set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF) endif() -add_library(${TARGET} +if (GGML_PLUGIN) + set(GGML_LIB_TYPE STATIC) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) +endif() + +add_library(${TARGET} ${GGML_LIB_TYPE} ggml.c ggml-alloc.c ggml-backend.c @@ -265,6 +270,8 @@ add_library(${TARGET} ${GGML_METAL_SOURCES} ) +add_library(ggml::ggml ALIAS ggml) + target_include_directories(${TARGET} PUBLIC . ../include @@ -278,7 +285,7 @@ else() target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT}) endif() -if (BUILD_SHARED_LIBS) +if (BUILD_SHARED_LIBS AND NOT GGML_PLUGIN) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) target_link_libraries(${TARGET} PUBLIC From 7f337079b10fd232cdaf31a6218466c8941c002b Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Tue, 10 Oct 2023 13:04:26 +0300 Subject: [PATCH 03/20] backend : add set_tensor_external_data --- include/ggml/ggml-backend.h | 8 ++++++++ src/ggml-backend.c | 11 +++++++++++ src/ggml-cuda.cu | 19 +++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index da134b0db..48d6c3f20 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -75,6 +75,12 @@ extern "C" { void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); void (*synchronize) (ggml_backend_t backend); + // set tensor data from external pointer (shallow copy) + // WARNING! It is the responsibility of the user to ensure that the provided pointer: + // * is compatible with the backend (same address space) + // * points to a memory of the right sie and type/quantization as described by the tensor + void (*set_tensor_external_data)(ggml_backend_t backend, struct ggml_tensor * tensor, void * data); + // (optional) copy tensor between different backends, allow for single-copy tranfers void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); @@ -116,6 +122,8 @@ extern "C" { GGML_API void ggml_backend_synchronize(ggml_backend_t backend); + GGML_API void ggml_backend_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data); + GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); diff --git a/src/ggml-backend.c b/src/ggml-backend.c index ca8d83daf..ebcefe4ef 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -114,6 +114,10 @@ void ggml_backend_synchronize(ggml_backend_t backend) { backend->iface.synchronize(backend); } +void ggml_backend_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { + backend->iface.set_tensor_external_data(backend, tensor, data); +} + ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->iface.graph_plan_create(backend, cgraph); } @@ -264,6 +268,12 @@ static void ggml_backend_cpu_synchronize(ggml_backend_t backend) { UNUSED(backend); } +static void ggml_backend_cpu_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { + GGML_ASSERT(tensor->buffer == NULL); + tensor->data = data; + UNUSED(backend); +} + static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); @@ -344,6 +354,7 @@ static struct ggml_backend_i cpu_backend_i = { /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async, /* .synchronize = */ ggml_backend_cpu_synchronize, + /* .set_tensor_external_data = */ ggml_backend_cpu_set_tensor_external_data, /* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to, /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 5bd83bb5c..1d30d122f 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7678,6 +7678,24 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } +static void ggml_backend_cuda_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { + GGML_ASSERT(tensor->buffer == NULL); + + ggml_tensor_extra_gpu* extra = nullptr; + if (tensor->extra) { + extra = (ggml_tensor_extra_gpu *) extra; + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + } + else { + ggml_tensor_extra_gpu* extra = ggml_cuda_alloc_temp_tensor_extra(); + tensor->backend = GGML_BACKEND_GPU; + tensor->extra = extra; + } + + tensor->data = data; + extra->data_device[g_main_device] = tensor->data; +} + static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { GGML_ASSERT(!"not implemented"); @@ -7758,6 +7776,7 @@ static ggml_backend_i cuda_backend_i = { /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, /* .synchronize = */ ggml_backend_cuda_synchronize, + /* .set_tensor_external_data = */ ggml_backend_cuda_set_tensor_external_data, /* .cpy_tensor_from = */ nullptr, /* .cpy_tensor_to = */ nullptr, /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, From ac9b0ba59fef86ec03f6f2b38c27e0f00d337886 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Tue, 10 Oct 2023 13:28:29 +0300 Subject: [PATCH 04/20] cuda : init backend as a plugin --- src/ggml-cuda.cu | 42 +++++++++++++++++++++++++++++++++++------- src/ggml-cuda.h | 1 + 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 1d30d122f..8fc8f1bdd 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -463,6 +463,8 @@ inline cudaError_t ggml_cuda_set_device(const int device) { return cudaSetDevice(device); } +static bool g_cublas_initialized = false; + static int g_device_count = -1; static int g_main_device = 0; static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; @@ -5632,9 +5634,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) { void ggml_init_cublas() { - static bool initialized = false; - - if (!initialized) { + if (!g_cublas_initialized) { #ifdef __HIP_PLATFORM_AMD__ // Workaround for a rocBLAS bug when using multiple graphics cards: @@ -5655,9 +5655,9 @@ void ggml_init_cublas() { g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; + g_compute_capabilities[id] = 100 * prop.major + 10 * prop.minor + CC_OFFSET_AMD; #else - g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; + g_compute_capabilities[id] = 100 * prop.major + 10 * prop.minor; #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } for (int id = 0; id < g_device_count; ++id) { @@ -5680,7 +5680,7 @@ void ggml_init_cublas() { // configure logging to stdout // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); - initialized = true; + g_cublas_initialized = true; } } @@ -7687,7 +7687,7 @@ static void ggml_backend_cuda_set_tensor_external_data(ggml_backend_t backend, s GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); } else { - ggml_tensor_extra_gpu* extra = ggml_cuda_alloc_temp_tensor_extra(); + extra = ggml_cuda_alloc_temp_tensor_extra(); tensor->backend = GGML_BACKEND_GPU; tensor->extra = extra; } @@ -7798,3 +7798,31 @@ ggml_backend_t ggml_backend_cuda_init() { return cuda_backend; } + +ggml_backend_t ggml_backend_cuda_init_plugin(int main_device, void * cublas_handle, void * cuda_stream) { + GGML_ASSERT(g_cublas_initialized == false && "currently only a single cuda backend is supported"); + + g_device_count = main_device + 1; + int id = g_main_device = main_device; + + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); + fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); + + // g_tensor_split[id] = 0; + g_compute_capabilities[id] = 100 * prop.major + 10 * prop.minor; + g_cublas_handles[id] = (cublasHandle_t)cublas_handle; + g_cudaStreams[id][0] = (cudaStream_t)cuda_stream; + + g_cublas_initialized = true; + + ggml_backend_context_cuda* ctx = new ggml_backend_context_cuda; + + ggml_backend_t cuda_backend = new ggml_backend { + /* .interface = */ cuda_backend_i, + /* .context = */ ctx + }; + + return cuda_backend; + +} diff --git a/src/ggml-cuda.h b/src/ggml-cuda.h index 57adc9cf3..e542461a6 100644 --- a/src/ggml-cuda.h +++ b/src/ggml-cuda.h @@ -45,6 +45,7 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description, // backend API GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use +GGML_API ggml_backend_t ggml_backend_cuda_init_plugin(int main_device, void * cublas_handle, void * cuda_stream); #ifdef __cplusplus } From 7396f84422186cc1a1c4cb2ba3f207f992994eb0 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Tue, 10 Oct 2023 13:57:01 +0300 Subject: [PATCH 05/20] cmake : rely on external CUDA if any to make CUDAToolkit available --- src/CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 00e199d91..3abbfec82 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -196,11 +196,7 @@ endif() if (GGML_CUBLAS) cmake_minimum_required(VERSION 3.17) - if (NOT CUDAToolkit_FOUND) - # only find package if not already found by an external source - find_package(CUDAToolkit) - endif() - + find_package(CUDAToolkit) if (CUDAToolkit_FOUND) message(STATUS "cuBLAS found") From e1f7aedc81679adf21fbb040b0835bb958b74ecd Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Tue, 10 Oct 2023 11:07:51 +0000 Subject: [PATCH 06/20] cuda : fix unused var warning --- src/ggml-cuda.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 8fc8f1bdd..ec8695f9a 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7694,6 +7694,8 @@ static void ggml_backend_cuda_set_tensor_external_data(ggml_backend_t backend, s tensor->data = data; extra->data_device[g_main_device] = tensor->data; + + UNUSED(backend); } static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { @@ -7824,5 +7826,4 @@ ggml_backend_t ggml_backend_cuda_init_plugin(int main_device, void * cublas_hand }; return cuda_backend; - } From 95770f574cfd7cbedac60ee4e915fabb1326c2d5 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Tue, 10 Oct 2023 15:07:36 +0300 Subject: [PATCH 07/20] cuda : destroy resources on free --- src/ggml-cuda.cu | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index ec8695f9a..28ffe29b1 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7536,6 +7536,21 @@ static const char * ggml_backend_cuda_name(ggml_backend_t backend) { } static void ggml_backend_cuda_free(ggml_backend_t backend) { + for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { + for (int is = 0; is < MAX_STREAMS; ++is) { + auto& stream = g_cudaStreams[id][is]; + if (!stream) break; + cudaStreamDestroy(stream); + stream = nullptr; + } + + auto& cublasHandle = g_cublas_handles[id]; + if (!cublasHandle) continue; + cublasDestroy(cublasHandle); + cublasHandle = nullptr; + } + g_cublas_initialized = false; + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; delete cuda_ctx; delete backend; From c36c63e4668a04cd526c7a5d9c442c0519f71316 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 11 Oct 2023 09:16:29 +0300 Subject: [PATCH 08/20] cuda : properly reuse tensor->extra --- src/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 28ffe29b1..f4ccabd26 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7698,8 +7698,8 @@ static void ggml_backend_cuda_set_tensor_external_data(ggml_backend_t backend, s ggml_tensor_extra_gpu* extra = nullptr; if (tensor->extra) { - extra = (ggml_tensor_extra_gpu *) extra; GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + extra = (ggml_tensor_extra_gpu *) tensor->extra; } else { extra = ggml_cuda_alloc_temp_tensor_extra(); From 4103c514fe067d9f93888918ff7bba9a024c88f9 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 11 Oct 2023 09:19:49 +0300 Subject: [PATCH 09/20] metal : backend interface set_external_data --- src/ggml-metal.m | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 29cb3c922..54fa43cff 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1528,6 +1528,12 @@ static void ggml_backend_metal_synchronize(ggml_backend_t backend) { UNUSED(backend); } +static void ggml_backend_metal_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { + GGML_ASSERT(tensor->buffer == NULL); + tensor->data = data; + UNUSED(backend); +} + static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); @@ -1560,6 +1566,7 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, /* .synchronize = */ ggml_backend_metal_synchronize, + /* .set_tensor_external_data = */ ggml_backend_metal_set_tensor_external_data, /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to, /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm From acd2940c9f9d01a679a0fb80395f9ffab6bc2af4 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 11 Oct 2023 09:45:35 +0300 Subject: [PATCH 10/20] minor : typo --- include/ggml/ggml-backend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 48d6c3f20..48d4daa01 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -78,7 +78,7 @@ extern "C" { // set tensor data from external pointer (shallow copy) // WARNING! It is the responsibility of the user to ensure that the provided pointer: // * is compatible with the backend (same address space) - // * points to a memory of the right sie and type/quantization as described by the tensor + // * points to a memory buffer of the right size and type/quantization as described by the tensor void (*set_tensor_external_data)(ggml_backend_t backend, struct ggml_tensor * tensor, void * data); // (optional) copy tensor between different backends, allow for single-copy tranfers From 77ba5bd978b14232dcd54026bc16f2f9de0b67e7 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 11 Oct 2023 10:15:09 +0300 Subject: [PATCH 11/20] cuda : only destroy handles on free if not a plugin --- src/ggml-cuda.cu | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index f4ccabd26..cb770c125 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -464,6 +464,7 @@ inline cudaError_t ggml_cuda_set_device(const int device) { } static bool g_cublas_initialized = false; +static bool g_cublas_initialized_as_plugin = false; static int g_device_count = -1; static int g_main_device = 0; @@ -7540,16 +7541,21 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) { for (int is = 0; is < MAX_STREAMS; ++is) { auto& stream = g_cudaStreams[id][is]; if (!stream) break; - cudaStreamDestroy(stream); + if (!g_cublas_initialized_as_plugin) { + cudaStreamDestroy(stream); + } stream = nullptr; } auto& cublasHandle = g_cublas_handles[id]; if (!cublasHandle) continue; - cublasDestroy(cublasHandle); + if (!g_cublas_initialized_as_plugin) { + cublasDestroy(cublasHandle); + } cublasHandle = nullptr; } g_cublas_initialized = false; + g_cublas_initialized_as_plugin = false; ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; delete cuda_ctx; @@ -7832,6 +7838,7 @@ ggml_backend_t ggml_backend_cuda_init_plugin(int main_device, void * cublas_hand g_cudaStreams[id][0] = (cudaStream_t)cuda_stream; g_cublas_initialized = true; + g_cublas_initialized_as_plugin = true; ggml_backend_context_cuda* ctx = new ggml_backend_context_cuda; From ac52b0eb5f760f390ee35b7a008a0d0e6667043d Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 11 Oct 2023 12:19:55 +0300 Subject: [PATCH 12/20] backends : dummy buffer for external tensors --- include/ggml/ggml-backend.h | 4 ++++ src/ggml-backend.c | 27 +++++++++++++++++++++++---- src/ggml-cuda.cu | 15 +++++++++++---- src/ggml-metal.m | 10 ++++++++-- 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 48d4daa01..67f047fd1 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -102,6 +102,8 @@ extern "C" { struct ggml_backend_i iface; ggml_backend_context_t context; + + struct ggml_backend_buffer dummy_external_tensor_buffer; }; // backend helper functions @@ -120,6 +122,8 @@ extern "C" { GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + GGML_API bool ggml_backend_is_tensor_external(const struct ggml_tensor* tensor); + GGML_API void ggml_backend_synchronize(ggml_backend_t backend); GGML_API void ggml_backend_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data); diff --git a/src/ggml-backend.c b/src/ggml-backend.c index ebcefe4ef..838c8874d 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -110,6 +110,10 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor)); } +bool ggml_backend_is_tensor_external(const struct ggml_tensor* tensor) { + return tensor->buffer == &ggml_get_backend(tensor)->dummy_external_tensor_buffer; +} + void ggml_backend_synchronize(ggml_backend_t backend) { backend->iface.synchronize(backend); } @@ -269,9 +273,13 @@ static void ggml_backend_cpu_synchronize(ggml_backend_t backend) { } static void ggml_backend_cpu_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { - GGML_ASSERT(tensor->buffer == NULL); + if (tensor->buffer) { + GGML_ASSERT(tensor->buffer == &backend->dummy_external_tensor_buffer); + } + else { + tensor->buffer = &backend->dummy_external_tensor_buffer; + } tensor->data = data; - UNUSED(backend); } static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { @@ -364,6 +372,16 @@ static struct ggml_backend_i cpu_backend_i = { /* .supports_op = */ ggml_backend_cpu_supports_op, }; +struct ggml_backend_buffer iggml_create_dummy_external_tensor_buffer(ggml_backend_t backend) { + struct ggml_backend_buffer ret = { + /* .interface = */ cpu_backend_buffer_i_from_ptr, + /* .backend = */ backend, + /* .context = */ NULL, + /* .size = */ 0, + }; + return ret; +} + ggml_backend_t ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); @@ -373,9 +391,10 @@ ggml_backend_t ggml_backend_cpu_init(void) { ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); - *cpu_backend = (struct ggml_backend) { + *cpu_backend = (struct ggml_backend){ /* .interface = */ cpu_backend_i, - /* .context = */ ctx + /* .context = */ ctx, + /* .dummy_external_tensor_buffer = */ iggml_create_dummy_external_tensor_buffer(cpu_backend) }; return cpu_backend; } diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index cb770c125..33227d95c 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7700,14 +7700,16 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { } static void ggml_backend_cuda_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { - GGML_ASSERT(tensor->buffer == NULL); - ggml_tensor_extra_gpu* extra = nullptr; if (tensor->extra) { + GGML_ASSERT(tensor->buffer == &backend->dummy_external_tensor_buffer); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); extra = (ggml_tensor_extra_gpu *) tensor->extra; } else { + GGML_ASSERT(tensor->buffer == NULL); + tensor->buffer = &backend->dummy_external_tensor_buffer; + extra = ggml_cuda_alloc_temp_tensor_extra(); tensor->backend = GGML_BACKEND_GPU; tensor->extra = extra; @@ -7809,6 +7811,7 @@ static ggml_backend_i cuda_backend_i = { /* .supports_op = */ nullptr, }; +extern "C" struct ggml_backend_buffer iggml_create_dummy_external_tensor_buffer(ggml_backend_t backend); ggml_backend_t ggml_backend_cuda_init() { ggml_init_cublas(); // TODO: remove from ggml.c @@ -7816,8 +7819,10 @@ ggml_backend_t ggml_backend_cuda_init() { ggml_backend_t cuda_backend = new ggml_backend { /* .interface = */ cuda_backend_i, - /* .context = */ ctx + /* .context = */ ctx, + /* .dummy_external_tensor_buffer = */ nullptr }; + cuda_backend->dummy_external_tensor_buffer = iggml_create_dummy_external_tensor_buffer(cuda_backend); return cuda_backend; } @@ -7844,8 +7849,10 @@ ggml_backend_t ggml_backend_cuda_init_plugin(int main_device, void * cublas_hand ggml_backend_t cuda_backend = new ggml_backend { /* .interface = */ cuda_backend_i, - /* .context = */ ctx + /* .context = */ ctx, + /* .dummy_external_tensor_buffer = */ nullptr }; + cuda_backend->dummy_external_tensor_buffer = iggml_create_dummy_external_tensor_buffer(cuda_backend); return cuda_backend; } diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 54fa43cff..d3132952d 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1529,9 +1529,13 @@ static void ggml_backend_metal_synchronize(ggml_backend_t backend) { } static void ggml_backend_metal_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { - GGML_ASSERT(tensor->buffer == NULL); + if (tensor->buffer) { + GGML_ASSERT(tensor->buffer == &backend->dummy_external_tensor_buffer); + } + else { + tensor->buffer = &backend->dummy_external_tensor_buffer; + } tensor->data = data; - UNUSED(backend); } static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { @@ -1576,6 +1580,7 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct /* .supports_op = */ ggml_backend_metal_supports_op, }; +extern struct ggml_backend_buffer iggml_create_dummy_external_tensor_buffer(ggml_backend_t backend); ggml_backend_t ggml_backend_metal_init(void) { struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); @@ -1586,6 +1591,7 @@ ggml_backend_t ggml_backend_metal_init(void) { *metal_backend = (struct ggml_backend) { /* .interface = */ metal_backend_i, /* .context = */ ctx, + /* .dummy_external_tensor_buffer = */ iggml_create_dummy_external_tensor_buffer(metal_backend) }; return metal_backend; From e0334641d98a602c8f9d59da3768f9be3d7bf68a Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 11 Oct 2023 15:19:48 +0300 Subject: [PATCH 13/20] backend : add offset arg to set_tensor_external_data --- include/ggml/ggml-backend.h | 4 ++-- src/ggml-backend.c | 8 ++++---- src/ggml-cuda.cu | 4 ++-- src/ggml-metal.m | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 67f047fd1..2d464dcf0 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -79,7 +79,7 @@ extern "C" { // WARNING! It is the responsibility of the user to ensure that the provided pointer: // * is compatible with the backend (same address space) // * points to a memory buffer of the right size and type/quantization as described by the tensor - void (*set_tensor_external_data)(ggml_backend_t backend, struct ggml_tensor * tensor, void * data); + void (*set_tensor_external_data)(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset); // (optional) copy tensor between different backends, allow for single-copy tranfers void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); @@ -126,7 +126,7 @@ extern "C" { GGML_API void ggml_backend_synchronize(ggml_backend_t backend); - GGML_API void ggml_backend_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data); + GGML_API void ggml_backend_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset); GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); diff --git a/src/ggml-backend.c b/src/ggml-backend.c index 838c8874d..63fdb59f2 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -118,8 +118,8 @@ void ggml_backend_synchronize(ggml_backend_t backend) { backend->iface.synchronize(backend); } -void ggml_backend_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { - backend->iface.set_tensor_external_data(backend, tensor, data); +void ggml_backend_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset) { + backend->iface.set_tensor_external_data(backend, tensor, data, offset); } ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { @@ -272,14 +272,14 @@ static void ggml_backend_cpu_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static void ggml_backend_cpu_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { +static void ggml_backend_cpu_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset) { if (tensor->buffer) { GGML_ASSERT(tensor->buffer == &backend->dummy_external_tensor_buffer); } else { tensor->buffer = &backend->dummy_external_tensor_buffer; } - tensor->data = data; + tensor->data = (uint8_t *)data + offset; } static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 33227d95c..6ee2f8d0a 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7699,7 +7699,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static void ggml_backend_cuda_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { +static void ggml_backend_cuda_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset) { ggml_tensor_extra_gpu* extra = nullptr; if (tensor->extra) { GGML_ASSERT(tensor->buffer == &backend->dummy_external_tensor_buffer); @@ -7715,7 +7715,7 @@ static void ggml_backend_cuda_set_tensor_external_data(ggml_backend_t backend, s tensor->extra = extra; } - tensor->data = data; + tensor->data = (uint8_t *)data + offset; extra->data_device[g_main_device] = tensor->data; UNUSED(backend); diff --git a/src/ggml-metal.m b/src/ggml-metal.m index d3132952d..bd38dc326 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1528,14 +1528,14 @@ static void ggml_backend_metal_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static void ggml_backend_metal_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data) { +static void ggml_backend_metal_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset) { if (tensor->buffer) { GGML_ASSERT(tensor->buffer == &backend->dummy_external_tensor_buffer); } else { tensor->buffer = &backend->dummy_external_tensor_buffer; } - tensor->data = data; + tensor->data = (uint8_t *)data + offset; } static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { From b29eb7f0bd361df240df353bba8b2d4125b94bb7 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 11 Oct 2023 15:32:07 +0300 Subject: [PATCH 14/20] minor : style --- src/ggml-cuda.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 6ee2f8d0a..351881661 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7812,6 +7812,7 @@ static ggml_backend_i cuda_backend_i = { }; extern "C" struct ggml_backend_buffer iggml_create_dummy_external_tensor_buffer(ggml_backend_t backend); + ggml_backend_t ggml_backend_cuda_init() { ggml_init_cublas(); // TODO: remove from ggml.c @@ -7820,7 +7821,7 @@ ggml_backend_t ggml_backend_cuda_init() { ggml_backend_t cuda_backend = new ggml_backend { /* .interface = */ cuda_backend_i, /* .context = */ ctx, - /* .dummy_external_tensor_buffer = */ nullptr + /* .dummy_external_tensor_buffer = */ {} }; cuda_backend->dummy_external_tensor_buffer = iggml_create_dummy_external_tensor_buffer(cuda_backend); @@ -7850,7 +7851,7 @@ ggml_backend_t ggml_backend_cuda_init_plugin(int main_device, void * cublas_hand ggml_backend_t cuda_backend = new ggml_backend { /* .interface = */ cuda_backend_i, /* .context = */ ctx, - /* .dummy_external_tensor_buffer = */ nullptr + /* .dummy_external_tensor_buffer = */ {} }; cuda_backend->dummy_external_tensor_buffer = iggml_create_dummy_external_tensor_buffer(cuda_backend); From 973c420c1c294aff62443d4276002af20e7750fa Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 11 Oct 2023 16:34:10 +0300 Subject: [PATCH 15/20] wip --- src/ggml-cuda.cu | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 351881661..418281298 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7812,22 +7812,27 @@ static ggml_backend_i cuda_backend_i = { }; extern "C" struct ggml_backend_buffer iggml_create_dummy_external_tensor_buffer(ggml_backend_t backend); - -ggml_backend_t ggml_backend_cuda_init() { - ggml_init_cublas(); // TODO: remove from ggml.c - - ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda; - - ggml_backend_t cuda_backend = new ggml_backend { +static ggml_backend_t create_cuda_backend(ggml_backend_context_cuda* ctx) { + ggml_backend_t cuda_backend = new ggml_backend{ /* .interface = */ cuda_backend_i, /* .context = */ ctx, /* .dummy_external_tensor_buffer = */ {} }; cuda_backend->dummy_external_tensor_buffer = iggml_create_dummy_external_tensor_buffer(cuda_backend); + //auto buf_ctx = new ggml_backend_buffer_context_cuda; + //buf_ctx->device = nullptr; + return cuda_backend; } +ggml_backend_t ggml_backend_cuda_init() { + ggml_init_cublas(); // TODO: remove from ggml.c + + ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda; + return create_cuda_backend(ctx); +} + ggml_backend_t ggml_backend_cuda_init_plugin(int main_device, void * cublas_handle, void * cuda_stream) { GGML_ASSERT(g_cublas_initialized == false && "currently only a single cuda backend is supported"); @@ -7847,13 +7852,5 @@ ggml_backend_t ggml_backend_cuda_init_plugin(int main_device, void * cublas_hand g_cublas_initialized_as_plugin = true; ggml_backend_context_cuda* ctx = new ggml_backend_context_cuda; - - ggml_backend_t cuda_backend = new ggml_backend { - /* .interface = */ cuda_backend_i, - /* .context = */ ctx, - /* .dummy_external_tensor_buffer = */ {} - }; - cuda_backend->dummy_external_tensor_buffer = iggml_create_dummy_external_tensor_buffer(cuda_backend); - - return cuda_backend; + return create_cuda_backend(ctx); } From 837aa5f467054754ca47c813d49698836c5a354f Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Thu, 12 Oct 2023 10:37:07 +0300 Subject: [PATCH 16/20] rewrite tensor external data --- include/ggml/ggml-alloc.h | 2 ++ include/ggml/ggml-backend.h | 12 ------------ src/ggml-alloc.c | 9 +++++++++ src/ggml-backend.c | 37 +++++------------------------------ src/ggml-cuda.cu | 39 +++++++------------------------------ src/ggml-metal.m | 13 ------------- 6 files changed, 23 insertions(+), 89 deletions(-) diff --git a/include/ggml/ggml-alloc.h b/include/ggml/ggml-alloc.h index e38758878..2bc70b338 100644 --- a/include/ggml/ggml-alloc.h +++ b/include/ggml/ggml-alloc.h @@ -23,6 +23,8 @@ GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_ GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc); +GGML_API void ggml_allocr_set_tensor_external_data(struct ggml_allocr * alloc, struct ggml_tensor * tensor, void * data, size_t data_offset); + GGML_API size_t ggml_allocr_alloc_graph_n( struct ggml_allocr * alloc, struct ggml_cgraph ** graphs, int n_graphs, diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 2d464dcf0..da134b0db 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -75,12 +75,6 @@ extern "C" { void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); void (*synchronize) (ggml_backend_t backend); - // set tensor data from external pointer (shallow copy) - // WARNING! It is the responsibility of the user to ensure that the provided pointer: - // * is compatible with the backend (same address space) - // * points to a memory buffer of the right size and type/quantization as described by the tensor - void (*set_tensor_external_data)(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset); - // (optional) copy tensor between different backends, allow for single-copy tranfers void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); @@ -102,8 +96,6 @@ extern "C" { struct ggml_backend_i iface; ggml_backend_context_t context; - - struct ggml_backend_buffer dummy_external_tensor_buffer; }; // backend helper functions @@ -122,12 +114,8 @@ extern "C" { GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - GGML_API bool ggml_backend_is_tensor_external(const struct ggml_tensor* tensor); - GGML_API void ggml_backend_synchronize(ggml_backend_t backend); - GGML_API void ggml_backend_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset); - GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 34eba3f83..41f021c10 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -183,6 +183,15 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); } +void ggml_allocr_set_tensor_external_data(struct ggml_allocr* alloc, struct ggml_tensor* tensor, void* data, size_t data_offset) { + GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources + GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated + GGML_ASSERT(data_offset == 0); // not supported yet + tensor->data = data; + tensor->buffer = alloc->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, tensor); +} + // this is a very naive implementation, but for our case the number of free blocks should be very small static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { if (ggml_allocr_is_own(alloc, tensor) == false) { diff --git a/src/ggml-backend.c b/src/ggml-backend.c index 63fdb59f2..f3da6cb4a 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -110,18 +110,10 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor)); } -bool ggml_backend_is_tensor_external(const struct ggml_tensor* tensor) { - return tensor->buffer == &ggml_get_backend(tensor)->dummy_external_tensor_buffer; -} - void ggml_backend_synchronize(ggml_backend_t backend) { backend->iface.synchronize(backend); } -void ggml_backend_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset) { - backend->iface.set_tensor_external_data(backend, tensor, data, offset); -} - ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->iface.graph_plan_create(backend, cgraph); } @@ -239,8 +231,11 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { - size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned - void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? + void * data = NULL; + if (size) { + size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned + data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? + } return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size); } @@ -272,16 +267,6 @@ static void ggml_backend_cpu_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static void ggml_backend_cpu_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset) { - if (tensor->buffer) { - GGML_ASSERT(tensor->buffer == &backend->dummy_external_tensor_buffer); - } - else { - tensor->buffer = &backend->dummy_external_tensor_buffer; - } - tensor->data = (uint8_t *)data + offset; -} - static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); @@ -362,7 +347,6 @@ static struct ggml_backend_i cpu_backend_i = { /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async, /* .synchronize = */ ggml_backend_cpu_synchronize, - /* .set_tensor_external_data = */ ggml_backend_cpu_set_tensor_external_data, /* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to, /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, @@ -372,16 +356,6 @@ static struct ggml_backend_i cpu_backend_i = { /* .supports_op = */ ggml_backend_cpu_supports_op, }; -struct ggml_backend_buffer iggml_create_dummy_external_tensor_buffer(ggml_backend_t backend) { - struct ggml_backend_buffer ret = { - /* .interface = */ cpu_backend_buffer_i_from_ptr, - /* .backend = */ backend, - /* .context = */ NULL, - /* .size = */ 0, - }; - return ret; -} - ggml_backend_t ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); @@ -394,7 +368,6 @@ ggml_backend_t ggml_backend_cpu_init(void) { *cpu_backend = (struct ggml_backend){ /* .interface = */ cpu_backend_i, /* .context = */ ctx, - /* .dummy_external_tensor_buffer = */ iggml_create_dummy_external_tensor_buffer(cpu_backend) }; return cpu_backend; } diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 418281298..bfe4f8daa 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7661,10 +7661,14 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { }; static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) { - ggml_cuda_set_device(g_main_device); - ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda; - CUDA_CHECK(cudaMalloc(&ctx->device, size)); + if (size) { + ggml_cuda_set_device(g_main_device); + CUDA_CHECK(cudaMalloc(&ctx->device, size)); + } + else { + ctx->device = NULL; + } return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size); } @@ -7699,28 +7703,6 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static void ggml_backend_cuda_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset) { - ggml_tensor_extra_gpu* extra = nullptr; - if (tensor->extra) { - GGML_ASSERT(tensor->buffer == &backend->dummy_external_tensor_buffer); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - extra = (ggml_tensor_extra_gpu *) tensor->extra; - } - else { - GGML_ASSERT(tensor->buffer == NULL); - tensor->buffer = &backend->dummy_external_tensor_buffer; - - extra = ggml_cuda_alloc_temp_tensor_extra(); - tensor->backend = GGML_BACKEND_GPU; - tensor->extra = extra; - } - - tensor->data = (uint8_t *)data + offset; - extra->data_device[g_main_device] = tensor->data; - - UNUSED(backend); -} - static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { GGML_ASSERT(!"not implemented"); @@ -7801,7 +7783,6 @@ static ggml_backend_i cuda_backend_i = { /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, /* .synchronize = */ ggml_backend_cuda_synchronize, - /* .set_tensor_external_data = */ ggml_backend_cuda_set_tensor_external_data, /* .cpy_tensor_from = */ nullptr, /* .cpy_tensor_to = */ nullptr, /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, @@ -7811,17 +7792,11 @@ static ggml_backend_i cuda_backend_i = { /* .supports_op = */ nullptr, }; -extern "C" struct ggml_backend_buffer iggml_create_dummy_external_tensor_buffer(ggml_backend_t backend); static ggml_backend_t create_cuda_backend(ggml_backend_context_cuda* ctx) { ggml_backend_t cuda_backend = new ggml_backend{ /* .interface = */ cuda_backend_i, /* .context = */ ctx, - /* .dummy_external_tensor_buffer = */ {} }; - cuda_backend->dummy_external_tensor_buffer = iggml_create_dummy_external_tensor_buffer(cuda_backend); - - //auto buf_ctx = new ggml_backend_buffer_context_cuda; - //buf_ctx->device = nullptr; return cuda_backend; } diff --git a/src/ggml-metal.m b/src/ggml-metal.m index bd38dc326..29cb3c922 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1528,16 +1528,6 @@ static void ggml_backend_metal_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static void ggml_backend_metal_set_tensor_external_data(ggml_backend_t backend, struct ggml_tensor * tensor, void * data, size_t offset) { - if (tensor->buffer) { - GGML_ASSERT(tensor->buffer == &backend->dummy_external_tensor_buffer); - } - else { - tensor->buffer = &backend->dummy_external_tensor_buffer; - } - tensor->data = (uint8_t *)data + offset; -} - static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); @@ -1570,7 +1560,6 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, /* .synchronize = */ ggml_backend_metal_synchronize, - /* .set_tensor_external_data = */ ggml_backend_metal_set_tensor_external_data, /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to, /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm @@ -1580,7 +1569,6 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct /* .supports_op = */ ggml_backend_metal_supports_op, }; -extern struct ggml_backend_buffer iggml_create_dummy_external_tensor_buffer(ggml_backend_t backend); ggml_backend_t ggml_backend_metal_init(void) { struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); @@ -1591,7 +1579,6 @@ ggml_backend_t ggml_backend_metal_init(void) { *metal_backend = (struct ggml_backend) { /* .interface = */ metal_backend_i, /* .context = */ ctx, - /* .dummy_external_tensor_buffer = */ iggml_create_dummy_external_tensor_buffer(metal_backend) }; return metal_backend; From 3ee1e60a9111769b56a21e040bafbd7a154ea30c Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Thu, 12 Oct 2023 10:57:57 +0300 Subject: [PATCH 17/20] plugin example --- examples/CMakeLists.txt | 1 + examples/plugin/CMakeLists.txt | 10 +++++ examples/plugin/README.md | 5 +++ examples/plugin/cpu-plugin.cpp | 42 ++++++++++++++++++++ examples/plugin/cuda-plugin.cpp | 67 ++++++++++++++++++++++++++++++++ examples/plugin/model.cpp | 68 +++++++++++++++++++++++++++++++++ examples/plugin/model.hpp | 37 ++++++++++++++++++ 7 files changed, 230 insertions(+) create mode 100644 examples/plugin/CMakeLists.txt create mode 100644 examples/plugin/README.md create mode 100644 examples/plugin/cpu-plugin.cpp create mode 100644 examples/plugin/cuda-plugin.cpp create mode 100644 examples/plugin/model.cpp create mode 100644 examples/plugin/model.hpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e3404fb8b..b569ecd64 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -28,3 +28,4 @@ add_subdirectory(replit) add_subdirectory(mpt) add_subdirectory(starcoder) add_subdirectory(sam) +add_subdirectory(plugin) diff --git a/examples/plugin/CMakeLists.txt b/examples/plugin/CMakeLists.txt new file mode 100644 index 000000000..aac1c62ad --- /dev/null +++ b/examples/plugin/CMakeLists.txt @@ -0,0 +1,10 @@ +add_library(plugin-model STATIC model.cpp) +target_link_libraries(plugin-model PUBLIC ggml::ggml) + +add_executable(cpu-plugin cpu-plugin.cpp) +target_link_libraries(cpu-plugin plugin-model) + +if (GGML_CUBLAS) + add_executable(cuda-plugin cuda-plugin.cpp) + target_link_libraries(cuda-plugin plugin-model) +endif() diff --git a/examples/plugin/README.md b/examples/plugin/README.md new file mode 100644 index 000000000..b75db63d4 --- /dev/null +++ b/examples/plugin/README.md @@ -0,0 +1,5 @@ +# GGML Plugin + +This example showcases the use of GGML as a plugin. + +The executables demonstrate how to initialize a backend and run inference with a model whose data comes from the outside. diff --git a/examples/plugin/cpu-plugin.cpp b/examples/plugin/cpu-plugin.cpp new file mode 100644 index 000000000..2e6db2d3b --- /dev/null +++ b/examples/plugin/cpu-plugin.cpp @@ -0,0 +1,42 @@ +#include "model.hpp" + +#include +#include + +#include +#include + +int main() { + auto backend = ggml_backend_cpu_init(); + + std::vector weights_data; + for (int i = 0; i < 10; ++i) { + weights_data.push_back(float(i)); + } + + void* weights = weights_data.data(); + + model m(backend, weights_data.size(), GGML_TYPE_F32, weights); + + std::vector input_data; + for (size_t i = 0; i < weights_data.size(); ++i) { + input_data.push_back(float(i) / 10); + } + + std::vector output_data(input_data.size()); + + void* input = input_data.data(); + void* output = output_data.data(); + + m.compute(output, input); + + ggml_backend_free(backend); + + std::cout << "["; + for (auto o : output_data) { + std::cout << o << ", "; + } + std::cout << "]\n"; + + return 0; +} diff --git a/examples/plugin/cuda-plugin.cpp b/examples/plugin/cuda-plugin.cpp new file mode 100644 index 000000000..8f278a57c --- /dev/null +++ b/examples/plugin/cuda-plugin.cpp @@ -0,0 +1,67 @@ +#include "model.hpp" + +#include +#include + +#include +#include +#include + +int main() { + // init cuda + int device_id = 0; + cudaSetDevice(device_id); + cublasHandle_t cublas_handle = nullptr; + cublasCreate(&cublas_handle); + cudaStream_t cuda_stream = nullptr; + cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking); + + // create plugin backend + auto backend = ggml_backend_cuda_init_plugin(device_id, cublas_handle, cuda_stream); + + // init weights + std::vector weights_data; + for (int i = 0; i < 10; ++i) { + weights_data.push_back(float(i)); + } + + void* weights = nullptr; + cudaMallocAsync(&weights, data_size(weights_data), cuda_stream); + cudaMemcpyAsync(weights, weights_data.data(), data_size(weights_data), cudaMemcpyHostToDevice, cuda_stream); + + // create model with weights + model m(backend, weights_data.size(), GGML_TYPE_F32, weights); + + // init input and output data + std::vector input_data; + for (size_t i = 0; i < weights_data.size(); ++i) { + input_data.push_back(float(i) / 10); + } + + std::vector output_data(input_data.size()); + + void* input = nullptr; + cudaMallocAsync(&input, data_size(input_data), cuda_stream); + cudaMemcpyAsync(input, input_data.data(), data_size(input_data), cudaMemcpyHostToDevice, cuda_stream); + + void* output = nullptr; + cudaMallocAsync(&output, data_size(output_data), cuda_stream); + + // compute with cuda pointers + m.compute(output, input); + + // get data back from cuda pointers + cudaMemcpyAsync(output_data.data(), output, data_size(output_data), cudaMemcpyDeviceToHost, cuda_stream); + cudaStreamSynchronize(cuda_stream); + + ggml_backend_free(backend); + + // print result + std::cout << "["; + for (auto o : output_data) { + std::cout << o << ", "; + } + std::cout << "]\n"; + + return 0; +} diff --git a/examples/plugin/model.cpp b/examples/plugin/model.cpp new file mode 100644 index 000000000..53583aa4c --- /dev/null +++ b/examples/plugin/model.cpp @@ -0,0 +1,68 @@ +#include "model.hpp" + +#include +#include +#include + +#include + +model::model(ggml_backend_t be, int64_t s, ggml_type t, void* weights_data) + : backend(be) + , size(s) + , type(t) +{ + assert(weights_data); + static constexpr size_t numWeightTensors = sizeof(weights_t) / sizeof(ggml_tensor*); + wctx = ggml_init({ + /*.mem_size =*/ ggml_tensor_overhead() * numWeightTensors, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }); + weights.w = ggml_new_tensor_1d(wctx, type, size); + wbuf = ggml_backend_alloc_buffer(backend, 0); + auto wallocr = ggml_allocr_new_from_buffer(wbuf); + ggml_allocr_set_tensor_external_data(wallocr, weights.w, weights_data, 0); + ggml_allocr_free(wallocr); + + cbuf = ggml_backend_alloc_buffer(backend, 0); + callocr = ggml_allocr_new_from_buffer(cbuf); +} + +model::~model() { + ggml_free(wctx); + ggml_backend_buffer_free(wbuf); + ggml_allocr_free(callocr); + ggml_backend_buffer_free(cbuf); +} + +struct io_tensors { + ggml_tensor* input = nullptr; + ggml_tensor* output = nullptr; +}; + +void model::compute(void* output, void* input) { + assert(input); + assert(output); + + static constexpr size_t num_io_tensors = sizeof(io_tensors) / sizeof(ggml_tensor*); + auto cctx = ggml_init({ + /*.mem_size =*/ ggml_tensor_overhead() * num_io_tensors + ggml_graph_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }); + + io_tensors io = {}; + io.input = ggml_new_tensor_1d(cctx, type, size); + io.output = ggml_add(cctx, io.input, weights.w); + + ggml_allocr_set_tensor_external_data(callocr, io.input, input, 0); + ggml_allocr_set_tensor_external_data(callocr, io.output, output, 0); + + auto graph = ggml_new_graph(cctx); + ggml_build_forward_expand(graph, io.output); + + ggml_backend_graph_compute(backend, graph); + + ggml_allocr_reset(callocr); + ggml_free(cctx); +} diff --git a/examples/plugin/model.hpp b/examples/plugin/model.hpp new file mode 100644 index 000000000..6b251c774 --- /dev/null +++ b/examples/plugin/model.hpp @@ -0,0 +1,37 @@ +#pragma once +#include + +struct ggml_tensor; +typedef struct ggml_backend* ggml_backend_t; +struct ggml_context; +enum ggml_type; +struct ggml_backend_buffer; +struct ggml_allocr; + +struct model { + struct weights_t { + ggml_tensor* w = nullptr; + } weights; + + ggml_backend_t backend = nullptr; + + ggml_context* wctx = nullptr; + ggml_backend_buffer* wbuf = nullptr; // weights buffer + + ggml_backend_buffer* cbuf = nullptr; // compute buffer + ggml_allocr* callocr = nullptr; // compute allocator + + const int64_t size; + const ggml_type type; + + model(ggml_backend_t be, int64_t s, ggml_type t, void* weights_data); + ~model(); + + void compute(void* output, void* input); +}; + +// util +template +size_t data_size(const Vec& vec) { + return vec.size() * sizeof(typename Vec::value_type); +} From a8b9f6a0ca0e4e8ee51c2dbda372547f037d991b Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Thu, 12 Oct 2023 11:48:05 +0300 Subject: [PATCH 18/20] remove non-standard enum fwd decl --- examples/plugin/cpu-plugin.cpp | 1 - examples/plugin/model.cpp | 1 - examples/plugin/model.hpp | 4 +--- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/plugin/cpu-plugin.cpp b/examples/plugin/cpu-plugin.cpp index 2e6db2d3b..548aadecf 100644 --- a/examples/plugin/cpu-plugin.cpp +++ b/examples/plugin/cpu-plugin.cpp @@ -1,6 +1,5 @@ #include "model.hpp" -#include #include #include diff --git a/examples/plugin/model.cpp b/examples/plugin/model.cpp index 53583aa4c..8dde16c40 100644 --- a/examples/plugin/model.cpp +++ b/examples/plugin/model.cpp @@ -1,6 +1,5 @@ #include "model.hpp" -#include #include #include diff --git a/examples/plugin/model.hpp b/examples/plugin/model.hpp index 6b251c774..fee63d9c0 100644 --- a/examples/plugin/model.hpp +++ b/examples/plugin/model.hpp @@ -1,10 +1,8 @@ #pragma once +#include #include -struct ggml_tensor; typedef struct ggml_backend* ggml_backend_t; -struct ggml_context; -enum ggml_type; struct ggml_backend_buffer; struct ggml_allocr; From 933b1329c992565a3a40e71b50088de7efbbc542 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Thu, 12 Oct 2023 14:12:51 +0300 Subject: [PATCH 19/20] minor : document set_tensor_external_data --- include/ggml/ggml-alloc.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/ggml/ggml-alloc.h b/include/ggml/ggml-alloc.h index 2bc70b338..9f9c9a8b9 100644 --- a/include/ggml/ggml-alloc.h +++ b/include/ggml/ggml-alloc.h @@ -23,6 +23,11 @@ GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_ GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc); +// set tensor data from external pointer (shallow copy) +// WARNING! It is the responsibility of the user to ensure that the provided pointer: +// * is compatible with the buffer backend (same address space) +// * points to memory of the right size and type/quantization as described by the tensor +// * remains valid while the associated tensor is used GGML_API void ggml_allocr_set_tensor_external_data(struct ggml_allocr * alloc, struct ggml_tensor * tensor, void * data, size_t data_offset); GGML_API size_t ggml_allocr_alloc_graph_n( From 6ced18f1718ca97355d6242366d18be88ff9aae5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Oct 2023 16:18:56 +0300 Subject: [PATCH 20/20] minor : spaces --- src/ggml-alloc.c | 2 +- src/ggml-backend.c | 2 +- src/ggml-cuda.cu | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 41f021c10..5da722d1a 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -183,7 +183,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); } -void ggml_allocr_set_tensor_external_data(struct ggml_allocr* alloc, struct ggml_tensor* tensor, void* data, size_t data_offset) { +void ggml_allocr_set_tensor_external_data(struct ggml_allocr * alloc, struct ggml_tensor * tensor, void * data, size_t data_offset) { GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated GGML_ASSERT(data_offset == 0); // not supported yet diff --git a/src/ggml-backend.c b/src/ggml-backend.c index f3da6cb4a..45cf7cfa1 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -365,7 +365,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); - *cpu_backend = (struct ggml_backend){ + *cpu_backend = (struct ggml_backend) { /* .interface = */ cpu_backend_i, /* .context = */ ctx, }; diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index bfe4f8daa..507250f9f 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7539,7 +7539,7 @@ static const char * ggml_backend_cuda_name(ggml_backend_t backend) { static void ggml_backend_cuda_free(ggml_backend_t backend) { for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { for (int is = 0; is < MAX_STREAMS; ++is) { - auto& stream = g_cudaStreams[id][is]; + auto & stream = g_cudaStreams[id][is]; if (!stream) break; if (!g_cublas_initialized_as_plugin) { cudaStreamDestroy(stream); @@ -7547,7 +7547,7 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) { stream = nullptr; } - auto& cublasHandle = g_cublas_handles[id]; + auto & cublasHandle = g_cublas_handles[id]; if (!cublasHandle) continue; if (!g_cublas_initialized_as_plugin) { cublasDestroy(cublasHandle);