Plugin interface for backends #570

Open · wants to merge 20 commits into master
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -28,3 +28,4 @@ add_subdirectory(replit)
add_subdirectory(mpt)
add_subdirectory(starcoder)
add_subdirectory(sam)
add_subdirectory(plugin)
10 changes: 10 additions & 0 deletions examples/plugin/CMakeLists.txt
@@ -0,0 +1,10 @@
add_library(plugin-model STATIC model.cpp)
target_link_libraries(plugin-model PUBLIC ggml::ggml)

add_executable(cpu-plugin cpu-plugin.cpp)
target_link_libraries(cpu-plugin plugin-model)

if (GGML_CUBLAS)
    add_executable(cuda-plugin cuda-plugin.cpp)
    target_link_libraries(cuda-plugin plugin-model)
endif()
5 changes: 5 additions & 0 deletions examples/plugin/README.md
@@ -0,0 +1,5 @@
# GGML Plugin

This example showcases the use of GGML as a plugin.

The executables demonstrate how to initialize a backend and run inference with a model whose data is provided from outside of ggml.
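
Both executables follow the same basic flow. The snippet below is a condensed sketch of cpu-plugin.cpp (the CUDA variant differs only in how the backend is created and in where the weight, input, and output pointers live):

#include "model.hpp"
#include <ggml-backend.h>
#include <vector>

int main() {
    // the host application owns the backend...
    ggml_backend_t backend = ggml_backend_cpu_init();

    // ...and the weight/input/output memory; ggml only borrows the pointers
    std::vector<float> weights(10, 1.0f), input(10, 0.5f), output(10);

    model m(backend, weights.size(), GGML_TYPE_F32, weights.data());
    m.compute(output.data(), input.data());

    ggml_backend_free(backend);
}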
41 changes: 41 additions & 0 deletions examples/plugin/cpu-plugin.cpp
@@ -0,0 +1,41 @@
#include "model.hpp"

#include <ggml-backend.h>

#include <vector>
#include <iostream>

int main() {
    auto backend = ggml_backend_cpu_init();

    std::vector<float> weights_data;
    for (int i = 0; i < 10; ++i) {
        weights_data.push_back(float(i));
    }

    void* weights = weights_data.data();

    model m(backend, weights_data.size(), GGML_TYPE_F32, weights);

    std::vector<float> input_data;
    for (size_t i = 0; i < weights_data.size(); ++i) {
        input_data.push_back(float(i) / 10);
    }

    std::vector<float> output_data(input_data.size());

    void* input = input_data.data();
    void* output = output_data.data();

    m.compute(output, input);

    ggml_backend_free(backend);

    std::cout << "[";
    for (auto o : output_data) {
        std::cout << o << ", ";
    }
    std::cout << "]\n";

    return 0;
}
67 changes: 67 additions & 0 deletions examples/plugin/cuda-plugin.cpp
@@ -0,0 +1,67 @@
#include "model.hpp"

#include <vector>
#include <iostream>

#include <ggml-cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main() {
    // init cuda
    int device_id = 0;
    cudaSetDevice(device_id);
    cublasHandle_t cublas_handle = nullptr;
    cublasCreate(&cublas_handle);
    cudaStream_t cuda_stream = nullptr;
    cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking);

    // create plugin backend
    auto backend = ggml_backend_cuda_init_plugin(device_id, cublas_handle, cuda_stream);

    // init weights
    std::vector<float> weights_data;
    for (int i = 0; i < 10; ++i) {
        weights_data.push_back(float(i));
    }

    void* weights = nullptr;
    cudaMallocAsync(&weights, data_size(weights_data), cuda_stream);
    cudaMemcpyAsync(weights, weights_data.data(), data_size(weights_data), cudaMemcpyHostToDevice, cuda_stream);

    // create model with weights
    model m(backend, weights_data.size(), GGML_TYPE_F32, weights);

    // init input and output data
    std::vector<float> input_data;
    for (size_t i = 0; i < weights_data.size(); ++i) {
        input_data.push_back(float(i) / 10);
    }

    std::vector<float> output_data(input_data.size());

    void* input = nullptr;
    cudaMallocAsync(&input, data_size(input_data), cuda_stream);
    cudaMemcpyAsync(input, input_data.data(), data_size(input_data), cudaMemcpyHostToDevice, cuda_stream);

    void* output = nullptr;
    cudaMallocAsync(&output, data_size(output_data), cuda_stream);

    // compute with cuda pointers
    m.compute(output, input);

    // get data back from cuda pointers
    cudaMemcpyAsync(output_data.data(), output, data_size(output_data), cudaMemcpyDeviceToHost, cuda_stream);
    cudaStreamSynchronize(cuda_stream);

    ggml_backend_free(backend);

    // print result
    std::cout << "[";
    for (auto o : output_data) {
        std::cout << o << ", ";
    }
    std::cout << "]\n";

    return 0;
}
67 changes: 67 additions & 0 deletions examples/plugin/model.cpp
@@ -0,0 +1,67 @@
#include "model.hpp"

#include <ggml-alloc.h>
#include <ggml-backend.h>

#include <cassert>

model::model(ggml_backend_t be, int64_t s, ggml_type t, void* weights_data)
    : backend(be)
    , size(s)
    , type(t)
{
    assert(weights_data);
    static constexpr size_t numWeightTensors = sizeof(weights_t) / sizeof(ggml_tensor*);
    wctx = ggml_init({
        /*.mem_size   =*/ ggml_tensor_overhead() * numWeightTensors,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    });
    weights.w = ggml_new_tensor_1d(wctx, type, size);
    wbuf = ggml_backend_alloc_buffer(backend, 0);
    auto wallocr = ggml_allocr_new_from_buffer(wbuf);
    ggml_allocr_set_tensor_external_data(wallocr, weights.w, weights_data, 0);
    ggml_allocr_free(wallocr);

    cbuf = ggml_backend_alloc_buffer(backend, 0);
    callocr = ggml_allocr_new_from_buffer(cbuf);
}

model::~model() {
    ggml_free(wctx);
    ggml_backend_buffer_free(wbuf);
    ggml_allocr_free(callocr);
    ggml_backend_buffer_free(cbuf);
}

struct io_tensors {
    ggml_tensor* input = nullptr;
    ggml_tensor* output = nullptr;
};

void model::compute(void* output, void* input) {
    assert(input);
    assert(output);

    static constexpr size_t num_io_tensors = sizeof(io_tensors) / sizeof(ggml_tensor*);
    auto cctx = ggml_init({
        /*.mem_size   =*/ ggml_tensor_overhead() * num_io_tensors + ggml_graph_overhead(),
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    });

    io_tensors io = {};
    io.input = ggml_new_tensor_1d(cctx, type, size);
    io.output = ggml_add(cctx, io.input, weights.w);

    ggml_allocr_set_tensor_external_data(callocr, io.input, input, 0);
    ggml_allocr_set_tensor_external_data(callocr, io.output, output, 0);

    auto graph = ggml_new_graph(cctx);
    ggml_build_forward_expand(graph, io.output);

    ggml_backend_graph_compute(backend, graph);

    ggml_allocr_reset(callocr);
    ggml_free(cctx);
}
35 changes: 35 additions & 0 deletions examples/plugin/model.hpp
@@ -0,0 +1,35 @@
#pragma once
#include <ggml.h>
#include <cstdint>

typedef struct ggml_backend* ggml_backend_t;
struct ggml_backend_buffer;
struct ggml_allocr;

struct model {
    struct weights_t {
        ggml_tensor* w = nullptr;
    } weights;

    ggml_backend_t backend = nullptr;

    ggml_context* wctx = nullptr;
    ggml_backend_buffer* wbuf = nullptr; // weights buffer

    ggml_backend_buffer* cbuf = nullptr; // compute buffer
    ggml_allocr* callocr = nullptr;      // compute allocator

    const int64_t size;
    const ggml_type type;

    model(ggml_backend_t be, int64_t s, ggml_type t, void* weights_data);
    ~model();

    void compute(void* output, void* input);
};

// util
template <typename Vec>
size_t data_size(const Vec& vec) {
    return vec.size() * sizeof(typename Vec::value_type);
}
7 changes: 7 additions & 0 deletions include/ggml/ggml-alloc.h
@@ -23,6 +23,13 @@ GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);

// set tensor data from external pointer (shallow copy)
// WARNING! It is the responsibility of the user to ensure that the provided pointer:
// * is compatible with the buffer backend (same address space)
// * points to memory of the right size and type/quantization as described by the tensor
// * remains valid while the associated tensor is used
GGML_API void ggml_allocr_set_tensor_external_data(struct ggml_allocr * alloc, struct ggml_tensor * tensor, void * data, size_t data_offset);

GGML_API size_t ggml_allocr_alloc_graph_n(
struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs,
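For reference, a condensed sketch of how model.cpp above uses the new call; the caller owns weights_data and guarantees it lives in the backend's address space (host memory for CPU, device memory for CUDA). backend, size, and weights_data are assumed to be set up by the caller, and ggml.h, ggml-alloc.h, and ggml-backend.h are assumed to be included:

// no_alloc context: ggml only tracks tensor metadata, the data pointer stays external
auto ctx = ggml_init({
    /*.mem_size   =*/ ggml_tensor_overhead(),
    /*.mem_buffer =*/ nullptr,
    /*.no_alloc   =*/ true,
});
auto w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, size);

auto buf    = ggml_backend_alloc_buffer(backend, 0); // zero-sized buffer: nothing is allocated
auto allocr = ggml_allocr_new_from_buffer(buf);
ggml_allocr_set_tensor_external_data(allocr, w, weights_data, /*data_offset=*/ 0);
ggml_allocr_free(allocr); // the tensor keeps pointing at weights_data; ctx and buf are freed later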
11 changes: 9 additions & 2 deletions src/CMakeLists.txt
@@ -249,7 +249,12 @@ if (GGML_PERF)
set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF)
endif()

-add_library(${TARGET}
+if (GGML_PLUGIN)
+    set(GGML_LIB_TYPE STATIC)
+    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+endif()
+
+add_library(${TARGET} ${GGML_LIB_TYPE}
    ggml.c
    ggml-alloc.c
    ggml-backend.c
@@ -261,6 +266,8 @@ add_library(${TARGET}
${GGML_METAL_SOURCES}
)

+add_library(ggml::ggml ALIAS ggml)
+
target_include_directories(${TARGET} PUBLIC
    .
    ../include
@@ -274,7 +281,7 @@ else()
target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
endif()

-if (BUILD_SHARED_LIBS)
+if (BUILD_SHARED_LIBS AND NOT GGML_PLUGIN)
    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

    target_link_libraries(${TARGET} PUBLIC
9 changes: 9 additions & 0 deletions src/ggml-alloc.c
@@ -183,6 +183,15 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
}

void ggml_allocr_set_tensor_external_data(struct ggml_allocr * alloc, struct ggml_tensor * tensor, void * data, size_t data_offset) {
    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
    GGML_ASSERT(tensor->data == NULL);  // avoid allocating tensor which already has memory allocated
    GGML_ASSERT(data_offset == 0);      // not supported yet
    tensor->data = data;
    tensor->buffer = alloc->buffer;
    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
}

// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
if (ggml_allocr_is_own(alloc, tensor) == false) {
9 changes: 6 additions & 3 deletions src/ggml-backend.c
@@ -231,8 +231,11 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
-    size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
+    void * data = NULL;
+    if (size) {
+        size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
+        data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
+    }

return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
}
@@ -364,7 +367,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {

*cpu_backend = (struct ggml_backend) {
/* .interface = */ cpu_backend_i,
-    /* .context = */ ctx
+    /* .context = */ ctx,
};
return cpu_backend;
}