diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index e3404fb8be..b569ecd647 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -28,3 +28,4 @@ add_subdirectory(replit)
 add_subdirectory(mpt)
 add_subdirectory(starcoder)
 add_subdirectory(sam)
+add_subdirectory(plugin)
diff --git a/examples/plugin/CMakeLists.txt b/examples/plugin/CMakeLists.txt
new file mode 100644
index 0000000000..aac1c62ad3
--- /dev/null
+++ b/examples/plugin/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_library(plugin-model STATIC model.cpp)
+target_link_libraries(plugin-model PUBLIC ggml::ggml)
+
+add_executable(cpu-plugin cpu-plugin.cpp)
+target_link_libraries(cpu-plugin plugin-model)
+
+if (GGML_CUBLAS)
+    add_executable(cuda-plugin cuda-plugin.cpp)
+    target_link_libraries(cuda-plugin plugin-model)
+endif()
diff --git a/examples/plugin/README.md b/examples/plugin/README.md
new file mode 100644
index 0000000000..b75db63d43
--- /dev/null
+++ b/examples/plugin/README.md
@@ -0,0 +1,5 @@
+# GGML Plugin
+
+This example showcases the use of GGML as a plugin.
+
+The executables demonstrate how to initialize a backend and run inference with a model whose weights and input/output buffers are provided from outside of ggml.
diff --git a/examples/plugin/cpu-plugin.cpp b/examples/plugin/cpu-plugin.cpp
new file mode 100644
index 0000000000..2e6db2d3bc
--- /dev/null
+++ b/examples/plugin/cpu-plugin.cpp
@@ -0,0 +1,42 @@
+#include "model.hpp"
+
+#include <ggml.h>
+#include <ggml-backend.h>
+
+#include <vector>
+#include <iostream>
+
+int main() {
+    auto backend = ggml_backend_cpu_init();
+
+    std::vector<float> weights_data;
+    for (int i = 0; i < 10; ++i) {
+        weights_data.push_back(float(i));
+    }
+
+    void* weights = weights_data.data();
+
+    model m(backend, weights_data.size(), GGML_TYPE_F32, weights);
+
+    std::vector<float> input_data;
+    for (size_t i = 0; i < weights_data.size(); ++i) {
+        input_data.push_back(float(i) / 10);
+    }
+
+    std::vector<float> output_data(input_data.size());
+
+    void* input = input_data.data();
+    void* output = output_data.data();
+
+    m.compute(output, input);
+
+    ggml_backend_free(backend);
+
+    std::cout << "[";
+    for (auto o : output_data) {
+        std::cout << o << ", ";
+    }
+    std::cout << "]\n";
+
+    return 0;
+}
diff --git a/examples/plugin/cuda-plugin.cpp b/examples/plugin/cuda-plugin.cpp
new file mode 100644
index 0000000000..8f278a57cd
--- /dev/null
+++ b/examples/plugin/cuda-plugin.cpp
@@ -0,0 +1,68 @@
+#include "model.hpp"
+
+#include <ggml-cuda.h>
+#include <ggml-backend.h>
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <vector>
+#include <iostream>
+
+int main() {
+    // init cuda
+    int device_id = 0;
+    cudaSetDevice(device_id);
+    cublasHandle_t cublas_handle = nullptr;
+    cublasCreate(&cublas_handle);
+    cudaStream_t cuda_stream = nullptr;
+    cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking);
+
+    // create plugin backend
+    auto backend = ggml_backend_cuda_init_plugin(device_id, cublas_handle, cuda_stream);
+
+    // init weights
+    std::vector<float> weights_data;
+    for (int i = 0; i < 10; ++i) {
+        weights_data.push_back(float(i));
+    }
+
+    void* weights = nullptr;
+    cudaMallocAsync(&weights, data_size(weights_data), cuda_stream);
+    cudaMemcpyAsync(weights, weights_data.data(), data_size(weights_data), cudaMemcpyHostToDevice, cuda_stream);
+
+    // create model with weights
+    model m(backend, weights_data.size(), GGML_TYPE_F32, weights);
+
+    // init input and output data
+    std::vector<float> input_data;
+    for (size_t i = 0; i < weights_data.size(); ++i) {
+        input_data.push_back(float(i) / 10);
+    }
+
+    std::vector<float> output_data(input_data.size());
+
+    void* input = nullptr;
+    cudaMallocAsync(&input, data_size(input_data), cuda_stream);
+    cudaMemcpyAsync(input, input_data.data(), data_size(input_data), cudaMemcpyHostToDevice, cuda_stream);
+
+    void* output = nullptr;
+    cudaMallocAsync(&output, data_size(output_data), cuda_stream);
+
+    // compute with cuda pointers
+    m.compute(output, input);
+
+    // get data back from cuda pointers
+    cudaMemcpyAsync(output_data.data(), output, data_size(output_data), cudaMemcpyDeviceToHost, cuda_stream);
+    cudaStreamSynchronize(cuda_stream);
+
+    ggml_backend_free(backend);
+
+    // print result
+    std::cout << "[";
+    for (auto o : output_data) {
+        std::cout << o << ", ";
+    }
+    std::cout << "]\n";
+
+    return 0;
+}
diff --git a/examples/plugin/model.cpp b/examples/plugin/model.cpp
new file mode 100644
index 0000000000..53583aa4c8
--- /dev/null
+++ b/examples/plugin/model.cpp
@@ -0,0 +1,76 @@
+#include "model.hpp"
+
+#include <ggml.h>
+#include <ggml-alloc.h>
+#include <ggml-backend.h>
+
+#include <cassert>
+
+model::model(ggml_backend_t be, int64_t s, ggml_type t, void* weights_data)
+    : backend(be)
+    , size(s)
+    , type(t)
+{
+    assert(weights_data);
+
+    // context for the weight tensors: holds tensor metadata only (no_alloc)
+    static constexpr size_t numWeightTensors = sizeof(weights_t) / sizeof(ggml_tensor*);
+    wctx = ggml_init({
+        /*.mem_size =*/ ggml_tensor_overhead() * numWeightTensors,
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc =*/ true,
+    });
+    weights.w = ggml_new_tensor_1d(wctx, type, size);
+
+    // bind the externally provided weights pointer to the weight tensor
+    wbuf = ggml_backend_alloc_buffer(backend, 0);
+    auto wallocr = ggml_allocr_new_from_buffer(wbuf);
+    ggml_allocr_set_tensor_external_data(wallocr, weights.w, weights_data, 0);
+    ggml_allocr_free(wallocr);
+
+    // buffer and allocator reused for the input/output tensors of each compute call
+    cbuf = ggml_backend_alloc_buffer(backend, 0);
+    callocr = ggml_allocr_new_from_buffer(cbuf);
+}
+
+model::~model() {
+    ggml_free(wctx);
+    ggml_backend_buffer_free(wbuf);
+    ggml_allocr_free(callocr);
+    ggml_backend_buffer_free(cbuf);
+}
+
+struct io_tensors {
+    ggml_tensor* input = nullptr;
+    ggml_tensor* output = nullptr;
+};
+
+void model::compute(void* output, void* input) {
+    assert(input);
+    assert(output);
+
+    // context for the compute graph and its input/output tensors
+    static constexpr size_t num_io_tensors = sizeof(io_tensors) / sizeof(ggml_tensor*);
+    auto cctx = ggml_init({
+        /*.mem_size =*/ ggml_tensor_overhead() * num_io_tensors + ggml_graph_overhead(),
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc =*/ true,
+    });
+
+    // output = input + weights
+    io_tensors io = {};
+    io.input = ggml_new_tensor_1d(cctx, type, size);
+    io.output = ggml_add(cctx, io.input, weights.w);
+
+    // bind the externally provided input/output pointers to the tensors
+    ggml_allocr_set_tensor_external_data(callocr, io.input, input, 0);
+    ggml_allocr_set_tensor_external_data(callocr, io.output, output, 0);
+
+    auto graph = ggml_new_graph(cctx);
+    ggml_build_forward_expand(graph, io.output);
+
+    ggml_backend_graph_compute(backend, graph);
+
+    ggml_allocr_reset(callocr);
+    ggml_free(cctx);
+}
diff --git a/examples/plugin/model.hpp b/examples/plugin/model.hpp
new file mode 100644
index 0000000000..6b251c774a
--- /dev/null
+++ b/examples/plugin/model.hpp
@@ -0,0 +1,38 @@
+#pragma once
+#include <cstddef>
+#include <cstdint>
+
+struct ggml_tensor;
+typedef struct ggml_backend* ggml_backend_t;
+struct ggml_context;
+enum ggml_type;
+struct ggml_backend_buffer;
+struct ggml_allocr;
+
+struct model {
+    struct weights_t {
+        ggml_tensor* w = nullptr;
+    } weights;
+
+    ggml_backend_t backend = nullptr;
+
+    ggml_context* wctx = nullptr;
+    ggml_backend_buffer* wbuf = nullptr; // weights buffer
+
+    ggml_backend_buffer* cbuf = nullptr; // compute buffer
+    ggml_allocr* callocr = nullptr; // compute allocator
+
+    const int64_t size;
+    const ggml_type type;
+
+    model(ggml_backend_t be, int64_t s, ggml_type t, void* weights_data);
+    ~model();
+
+    void compute(void* output, void* input);
+};
+
+// util
+template <typename Vec>
+size_t data_size(const Vec& vec) {
+    return vec.size() * sizeof(typename Vec::value_type);
+}
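A minimal sketch of how the examples above might be configured, built, and run from the ggml repository root, assuming the standard CMake flow used by the other examples; the GGML_CUBLAS option and the cpu-plugin/cuda-plugin target names come from the CMakeLists.txt in this patch, while the build/bin output directory is an assumption that may differ per setup:

    cmake -B build -DGGML_CUBLAS=ON
    cmake --build build --target cpu-plugin cuda-plugin
    ./build/bin/cpu-plugin     # CPU backend, host pointers
    ./build/bin/cuda-plugin    # CUDA backend, device pointers

With the weights 0..9 and the inputs 0.0..0.9 set up by both programs, the graph computes output = input + weights, so each executable should print [0, 1.1, 2.2, ..., 9.9, ].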