Plugin interface for backends #570

Open · wants to merge 20 commits into master
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -28,3 +28,4 @@ add_subdirectory(replit)
add_subdirectory(mpt)
add_subdirectory(starcoder)
add_subdirectory(sam)
add_subdirectory(plugin)
10 changes: 10 additions & 0 deletions examples/plugin/CMakeLists.txt
@@ -0,0 +1,10 @@
add_library(plugin-model STATIC model.cpp)
target_link_libraries(plugin-model PUBLIC ggml::ggml)

add_executable(cpu-plugin cpu-plugin.cpp)
target_link_libraries(cpu-plugin plugin-model)

if (GGML_CUBLAS)
    add_executable(cuda-plugin cuda-plugin.cpp)
    target_link_libraries(cuda-plugin plugin-model)
endif()
5 changes: 5 additions & 0 deletions examples/plugin/README.md
@@ -0,0 +1,5 @@
# GGML Plugin

This example showcases the use of GGML as a plugin.

The executables demonstrate how to initialize a backend and run inference with a model whose data is provided from outside of ggml.
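
Both executables follow the same basic flow. The snippet below is a condensed sketch of cpu-plugin.cpp (the CUDA variant differs only in how the backend is created and in where the weight, input, and output pointers live):

#include "model.hpp"
#include <ggml-backend.h>
#include <vector>

int main() {
    // the host application owns the backend...
    ggml_backend_t backend = ggml_backend_cpu_init();

    // ...and the weight/input/output memory; ggml only borrows the pointers
    std::vector<float> weights(10, 1.0f), input(10, 0.5f), output(10);

    model m(backend, weights.size(), GGML_TYPE_F32, weights.data());
    m.compute(output.data(), input.data());

    ggml_backend_free(backend);
}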
41 changes: 41 additions & 0 deletions examples/plugin/cpu-plugin.cpp
@@ -0,0 +1,41 @@
#include "model.hpp"

#include <ggml-backend.h>

#include <vector>
#include <iostream>

int main() {
    auto backend = ggml_backend_cpu_init();

    std::vector<float> weights_data;
    for (int i = 0; i < 10; ++i) {
        weights_data.push_back(float(i));
    }

    void* weights = weights_data.data();

    model m(backend, weights_data.size(), GGML_TYPE_F32, weights);

    std::vector<float> input_data;
    for (size_t i = 0; i < weights_data.size(); ++i) {
        input_data.push_back(float(i) / 10);
    }

    std::vector<float> output_data(input_data.size());

    void* input = input_data.data();
    void* output = output_data.data();

    m.compute(output, input);

    ggml_backend_free(backend);

    std::cout << "[";
    for (auto o : output_data) {
        std::cout << o << ", ";
    }
    std::cout << "]\n";

    return 0;
}
67 changes: 67 additions & 0 deletions examples/plugin/cuda-plugin.cpp
@@ -0,0 +1,67 @@
#include "model.hpp"

#include <vector>
#include <iostream>

#include <ggml-cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main() {
    // init cuda
    int device_id = 0;
    cudaSetDevice(device_id);
    cublasHandle_t cublas_handle = nullptr;
    cublasCreate(&cublas_handle);
    cudaStream_t cuda_stream = nullptr;
    cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking);

    // create plugin backend
    auto backend = ggml_backend_cuda_init_plugin(device_id, cublas_handle, cuda_stream);

    // init weights
    std::vector<float> weights_data;
    for (int i = 0; i < 10; ++i) {
        weights_data.push_back(float(i));
    }

    void* weights = nullptr;
    cudaMallocAsync(&weights, data_size(weights_data), cuda_stream);
    cudaMemcpyAsync(weights, weights_data.data(), data_size(weights_data), cudaMemcpyHostToDevice, cuda_stream);

    // create model with weights
    model m(backend, weights_data.size(), GGML_TYPE_F32, weights);

    // init input and output data
    std::vector<float> input_data;
    for (size_t i = 0; i < weights_data.size(); ++i) {
        input_data.push_back(float(i) / 10);
    }

    std::vector<float> output_data(input_data.size());

    void* input = nullptr;
    cudaMallocAsync(&input, data_size(input_data), cuda_stream);
    cudaMemcpyAsync(input, input_data.data(), data_size(input_data), cudaMemcpyHostToDevice, cuda_stream);

    void* output = nullptr;
    cudaMallocAsync(&output, data_size(output_data), cuda_stream);

    // compute with cuda pointers
    m.compute(output, input);

    // get data back from cuda pointers
    cudaMemcpyAsync(output_data.data(), output, data_size(output_data), cudaMemcpyDeviceToHost, cuda_stream);
    cudaStreamSynchronize(cuda_stream);

    ggml_backend_free(backend);

    // print result
    std::cout << "[";
    for (auto o : output_data) {
        std::cout << o << ", ";
    }
    std::cout << "]\n";

    return 0;
}
67 changes: 67 additions & 0 deletions examples/plugin/model.cpp
@@ -0,0 +1,67 @@
#include "model.hpp"

#include <ggml-alloc.h>
#include <ggml-backend.h>

#include <cassert>

model::model(ggml_backend_t be, int64_t s, ggml_type t, void* weights_data)
    : backend(be)
    , size(s)
    , type(t)
{
    assert(weights_data);
    static constexpr size_t numWeightTensors = sizeof(weights_t) / sizeof(ggml_tensor*);
    wctx = ggml_init({
        /*.mem_size   =*/ ggml_tensor_overhead() * numWeightTensors,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    });
    weights.w = ggml_new_tensor_1d(wctx, type, size);
    wbuf = ggml_backend_alloc_buffer(backend, 0);
    auto wallocr = ggml_allocr_new_from_buffer(wbuf);
    ggml_allocr_set_tensor_external_data(wallocr, weights.w, weights_data, 0);
    ggml_allocr_free(wallocr);

    cbuf = ggml_backend_alloc_buffer(backend, 0);
    callocr = ggml_allocr_new_from_buffer(cbuf);
}

model::~model() {
    ggml_free(wctx);
    ggml_backend_buffer_free(wbuf);
    ggml_allocr_free(callocr);
    ggml_backend_buffer_free(cbuf);
}

struct io_tensors {
    ggml_tensor* input = nullptr;
    ggml_tensor* output = nullptr;
};

void model::compute(void* output, void* input) {
    assert(input);
    assert(output);

    static constexpr size_t num_io_tensors = sizeof(io_tensors) / sizeof(ggml_tensor*);
    auto cctx = ggml_init({
        /*.mem_size   =*/ ggml_tensor_overhead() * num_io_tensors + ggml_graph_overhead(),
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    });

    io_tensors io = {};
    io.input = ggml_new_tensor_1d(cctx, type, size);
    io.output = ggml_add(cctx, io.input, weights.w);

    ggml_allocr_set_tensor_external_data(callocr, io.input, input, 0);
    ggml_allocr_set_tensor_external_data(callocr, io.output, output, 0);

    auto graph = ggml_new_graph(cctx);
    ggml_build_forward_expand(graph, io.output);

    ggml_backend_graph_compute(backend, graph);

    ggml_allocr_reset(callocr);
    ggml_free(cctx);
}
35 changes: 35 additions & 0 deletions examples/plugin/model.hpp
@@ -0,0 +1,35 @@
#pragma once
#include <ggml.h>
#include <cstdint>

typedef struct ggml_backend* ggml_backend_t;
struct ggml_backend_buffer;
struct ggml_allocr;

struct model {
    struct weights_t {
        ggml_tensor* w = nullptr;
    } weights;

    ggml_backend_t backend = nullptr;

    ggml_context* wctx = nullptr;
    ggml_backend_buffer* wbuf = nullptr; // weights buffer

    ggml_backend_buffer* cbuf = nullptr; // compute buffer
    ggml_allocr* callocr = nullptr;      // compute allocator

    const int64_t size;
    const ggml_type type;

    model(ggml_backend_t be, int64_t s, ggml_type t, void* weights_data);
    ~model();

    void compute(void* output, void* input);
};

// util
template <typename Vec>
size_t data_size(const Vec& vec) {
    return vec.size() * sizeof(typename Vec::value_type);
}
7 changes: 7 additions & 0 deletions include/ggml/ggml-alloc.h
@@ -23,6 +23,13 @@ GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);

// set tensor data from external pointer (shallow copy)
// WARNING! It is the responsibility of the user to ensure that the provided pointer:
// * is compatible with the buffer backend (same address space)
// * points to memory of the right size and type/quantization as described by the tensor
// * remains valid while the associated tensor is used
GGML_API void ggml_allocr_set_tensor_external_data(struct ggml_allocr * alloc, struct ggml_tensor * tensor, void * data, size_t data_offset);

GGML_API size_t ggml_allocr_alloc_graph_n(
struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs,
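For reference, a condensed sketch of how model.cpp above uses the new call; the caller owns weights_data and guarantees it lives in the backend's address space (host memory for CPU, device memory for CUDA). backend, size, and weights_data are assumed to be set up by the caller, and ggml.h, ggml-alloc.h, and ggml-backend.h are assumed to be included:

// no_alloc context: ggml only tracks tensor metadata, the data pointer stays external
auto ctx = ggml_init({
    /*.mem_size   =*/ ggml_tensor_overhead(),
    /*.mem_buffer =*/ nullptr,
    /*.no_alloc   =*/ true,
});
auto w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, size);

auto buf    = ggml_backend_alloc_buffer(backend, 0); // zero-sized buffer: nothing is allocated
auto allocr = ggml_allocr_new_from_buffer(buf);
ggml_allocr_set_tensor_external_data(allocr, w, weights_data, /*data_offset=*/ 0);
ggml_allocr_free(allocr); // the tensor keeps pointing at weights_data; ctx and buf are freed later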
11 changes: 9 additions & 2 deletions src/CMakeLists.txt
@@ -249,7 +249,12 @@ if (GGML_PERF)
set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF)
endif()

-add_library(${TARGET}
+if (GGML_PLUGIN)
+    set(GGML_LIB_TYPE STATIC)
+    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+endif()
+
+add_library(${TARGET} ${GGML_LIB_TYPE}
    ggml.c
    ggml-alloc.c
    ggml-backend.c
@@ -261,6 +266,8 @@ add_library(${TARGET}
${GGML_METAL_SOURCES}
)

+add_library(ggml::ggml ALIAS ggml)
+
target_include_directories(${TARGET} PUBLIC
    .
    ../include
@@ -274,7 +281,7 @@ else()
target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
endif()

-if (BUILD_SHARED_LIBS)
+if (BUILD_SHARED_LIBS AND NOT GGML_PLUGIN)
    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

    target_link_libraries(${TARGET} PUBLIC
9 changes: 9 additions & 0 deletions src/ggml-alloc.c
@@ -183,6 +183,15 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
}

void ggml_allocr_set_tensor_external_data(struct ggml_allocr * alloc, struct ggml_tensor * tensor, void * data, size_t data_offset) {
    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
    GGML_ASSERT(tensor->data == NULL);  // avoid allocating tensor which already has memory allocated
    GGML_ASSERT(data_offset == 0);      // not supported yet
    tensor->data = data;
    tensor->buffer = alloc->buffer;
    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
}

// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
if (ggml_allocr_is_own(alloc, tensor) == false) {
9 changes: 6 additions & 3 deletions src/ggml-backend.c
@@ -231,8 +231,11 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
-    size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
+    void * data = NULL;
+    if (size) {
+        size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
+        data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
+    }

return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
}
@@ -364,7 +367,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {

*cpu_backend = (struct ggml_backend) {
/* .interface = */ cpu_backend_i,
-    /* .context = */ ctx
+    /* .context = */ ctx,
};
return cpu_backend;
}