diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 416d22cd7..ea207a220 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -6,6 +6,13 @@
 using namespace inferences;
 using json = nlohmann::json;
 
+/**
+ * Keeps the state of a handler's current, ongoing inference, so that the
+ * inference status can be saved and updated while the request is processed.
+ *
+ * @param inst Pointer to the llamaCPP instance this inference task is
+ * associated with.
+ */
 struct inferenceState {
   bool is_stopped = false;
   bool is_streaming = false;
@@ -15,15 +22,20 @@ struct inferenceState {
   inferenceState(llamaCPP *inst) : instance(inst) {}
 };
 
+/**
+ * Creates the smart pointer to an inferenceState, so the state keeps living
+ * even though the streaming lambda may go out of scope and the handler has
+ * already moved on.
+ */
std::shared_ptr<inferenceState> create_inference_state(llamaCPP *instance) {
   return std::make_shared<inferenceState>(instance);
 }
 
-// --------------------------------------------
-
-// Function to check if the model is loaded
-void check_model_loaded(
-    llama_server_context &llama, const HttpRequestPtr &req,
+/**
+ * Checks whether a model is loaded; if not, returns a message to the user.
+ * @param callback the function used to return the message to the user
+ */
+void llamaCPP::checkModelLoaded(
     std::function<void(const HttpResponsePtr &)> &callback) {
   if (!llama.model_loaded_external) {
     Json::Value jsonResp;
@@ -136,7 +148,7 @@ void llamaCPP::warmupModel() {
   return;
 }
 
-void llamaCPP::chatCompletionPrelight(
+void llamaCPP::handlePrelight(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
   auto resp = drogon::HttpResponse::newHttpResponse();
@@ -151,10 +163,17 @@ void llamaCPP::chatCompletion(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
+  const auto &jsonBody = req->getJsonObject();
   // Check if model is loaded
-  check_model_loaded(llama, req, callback);
+  checkModelLoaded(callback);
+
+  chatCompletionImpl(jsonBody, callback);
+}
+
+void llamaCPP::chatCompletionImpl(
+    std::shared_ptr<Json::Value> jsonBody,
+    std::function<void(const HttpResponsePtr &)> &callback) {
 
-  const auto &jsonBody = req->getJsonObject();
   std::string formatted_output = pre_prompt;
 
   json data;
@@ -402,17 +421,23 @@ void llamaCPP::chatCompletion(
     }
   }
 }
+
 void llamaCPP::embedding(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
-  check_model_loaded(llama, req, callback);
+  checkModelLoaded(callback);
+  const auto &jsonBody = req->getJsonObject();
 
-  auto state = create_inference_state(this);
+  embeddingImpl(jsonBody, callback);
+  return;
+}
 
-  const auto &jsonBody = req->getJsonObject();
+void llamaCPP::embeddingImpl(
+    std::shared_ptr<Json::Value> jsonBody,
+    std::function<void(const HttpResponsePtr &)> &callback) {
 
   Json::Value responseData(Json::arrayValue);
-
+  auto state = create_inference_state(this);
   if (jsonBody->isMember("input")) {
     // If single queue is busy, we will wait if not we will just go ahead and
     // process and make it busy, and yet i'm aware not DRY, i have the same
@@ -464,7 +489,6 @@ void llamaCPP::embedding(
   resp->setBody(Json::writeString(Json::StreamWriterBuilder(), root));
   resp->setContentTypeString("application/json");
   callback(resp);
-  return;
 }
 
 void llamaCPP::unloadModel(
@@ -501,31 +525,61 @@ void llamaCPP::modelStatus(
   callback(resp);
   return;
 }
+void llamaCPP::loadModel(
+    const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &&callback) {
 
-bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
+  if (llama.model_loaded_external) {
+    LOG_INFO << "model loaded";
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model already loaded";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k409Conflict);
+    callback(resp);
+    return;
+  }
 
-  gpt_params params;
+  const auto &jsonBody = req->getJsonObject();
+  if (!loadModelImpl(jsonBody)) {
+    // Error occurred during model loading
+    Json::Value jsonResp;
+    jsonResp["message"] = "Failed to load model";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+  } else {
+    // Model loaded successfully
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model loaded successfully";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+  }
+}
+
+bool llamaCPP::loadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
+  gpt_params params;
   // By default will setting based on number of handlers
   if (jsonBody) {
-    if (!jsonBody["mmproj"].isNull()) {
+    if (!jsonBody->operator[]("mmproj").isNull()) {
       LOG_INFO << "MMPROJ FILE detected, multi-model enabled!";
-      params.mmproj = jsonBody["mmproj"].asString();
+      params.mmproj = jsonBody->operator[]("mmproj").asString();
     }
 
-    if (!jsonBody["grp_attn_n"].isNull()) {
-      params.grp_attn_n = jsonBody["grp_attn_n"].asInt();
+    if (!jsonBody->operator[]("grp_attn_n").isNull()) {
+      params.grp_attn_n = jsonBody->operator[]("grp_attn_n").asInt();
     }
 
-    if (!jsonBody["grp_attn_w"].isNull()) {
-      params.grp_attn_w = jsonBody["grp_attn_w"].asInt();
+    if (!jsonBody->operator[]("grp_attn_w").isNull()) {
+      params.grp_attn_w = jsonBody->operator[]("grp_attn_w").asInt();
     }
 
-    if (!jsonBody["mlock"].isNull()) {
-      params.use_mlock = jsonBody["mlock"].asBool();
+    if (!jsonBody->operator[]("mlock").isNull()) {
+      params.use_mlock = jsonBody->operator[]("mlock").asBool();
     }
 
-    if (!jsonBody["grammar_file"].isNull()) {
-      std::string grammar_file = jsonBody["grammar_file"].asString();
+    if (!jsonBody->operator[]("grammar_file").isNull()) {
+      std::string grammar_file =
+          jsonBody->operator[]("grammar_file").asString();
       std::ifstream file(grammar_file);
       if (!file) {
         LOG_ERROR << "Grammar file not found";
@@ -536,30 +590,31 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
      }
    };
 
-    params.model = jsonBody["llama_model_path"].asString();
-    params.n_gpu_layers = jsonBody.get("ngl", 100).asInt();
-    params.n_ctx = jsonBody.get("ctx_len", 2048).asInt();
-    params.embedding = jsonBody.get("embedding", true).asBool();
+    params.model = jsonBody->operator[]("llama_model_path").asString();
+    params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
+    params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
+    params.embedding = jsonBody->get("embedding", true).asBool();
     // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
-    params.n_batch = jsonBody.get("n_batch", 512).asInt();
-    params.n_parallel = jsonBody.get("n_parallel", 1).asInt();
+    params.n_batch = jsonBody->get("n_batch", 512).asInt();
+    params.n_parallel = jsonBody->get("n_parallel", 1).asInt();
     params.n_threads =
-        jsonBody.get("cpu_threads", std::thread::hardware_concurrency())
+        jsonBody->get("cpu_threads", std::thread::hardware_concurrency())
            .asInt();
-    params.cont_batching = jsonBody.get("cont_batching", false).asBool();
+    params.cont_batching = jsonBody->get("cont_batching", false).asBool();
 
     this->clean_cache_threshold =
-        jsonBody.get("clean_cache_threshold", 5).asInt();
-    this->caching_enabled = jsonBody.get("caching_enabled", false).asBool();
-    this->user_prompt = jsonBody.get("user_prompt", "USER: ").asString();
-    this->ai_prompt = jsonBody.get("ai_prompt", "ASSISTANT: ").asString();
+        jsonBody->get("clean_cache_threshold", 5).asInt();
+    this->caching_enabled = jsonBody->get("caching_enabled", false).asBool();
+    this->user_prompt = jsonBody->get("user_prompt", "USER: ").asString();
+    this->ai_prompt = jsonBody->get("ai_prompt", "ASSISTANT: ").asString();
     this->system_prompt =
-        jsonBody.get("system_prompt", "ASSISTANT's RULE: ").asString();
-    this->pre_prompt = jsonBody.get("pre_prompt", "").asString();
-    this->repeat_last_n = jsonBody.get("repeat_last_n", 32).asInt();
+        jsonBody->get("system_prompt", "ASSISTANT's RULE: ").asString();
+    this->pre_prompt = jsonBody->get("pre_prompt", "").asString();
+    this->repeat_last_n = jsonBody->get("repeat_last_n", 32).asInt();
-    if (!jsonBody["llama_log_folder"].isNull()) {
+    if (!jsonBody->operator[]("llama_log_folder").isNull()) {
       log_enable();
-      std::string llama_log_folder = jsonBody["llama_log_folder"].asString();
+      std::string llama_log_folder =
+          jsonBody->operator[]("llama_log_folder").asString();
       log_set_target(llama_log_folder + "llama.log");
     } // Set folder for llama log
   }
@@ -597,37 +652,6 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
   return true;
 }
 
-void llamaCPP::loadModel(
-    const HttpRequestPtr &req,
-    std::function<void(const HttpResponsePtr &)> &&callback) {
-
-  if (llama.model_loaded_external) {
-    LOG_INFO << "model loaded";
-    Json::Value jsonResp;
-    jsonResp["message"] = "Model already loaded";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    resp->setStatusCode(drogon::k409Conflict);
-    callback(resp);
-    return;
-  }
-
-  const auto &jsonBody = req->getJsonObject();
-  if (!loadModelImpl(*jsonBody)) {
-    // Error occurred during model loading
-    Json::Value jsonResp;
-    jsonResp["message"] = "Failed to load model";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    resp->setStatusCode(drogon::k500InternalServerError);
-    callback(resp);
-  } else {
-    // Model loaded successfully
-    Json::Value jsonResp;
-    jsonResp["message"] = "Model loaded successfully";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    callback(resp);
-  }
-}
-
 void llamaCPP::backgroundTask() {
   while (llama.model_loaded_external) {
     // model_loaded =
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index a9581da21..adc179489 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2530,36 +2530,26 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   // Openai compatible path
   ADD_METHOD_TO(llamaCPP::chatCompletion, "/v1/chat/completions", Post);
-  ADD_METHOD_TO(llamaCPP::chatCompletionPrelight, "/v1/chat/completions",
-                Options);
+  ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/chat/completions", Options);
 
   ADD_METHOD_TO(llamaCPP::embedding, "/v1/embeddings", Post);
+  ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/embeddings", Options);
   // PATH_ADD("/llama/chat_completion", Post);
   METHOD_LIST_END
 
   void chatCompletion(const HttpRequestPtr &req,
                       std::function<void(const HttpResponsePtr &)> &&callback);
-  void chatCompletionPrelight(
-      const HttpRequestPtr &req,
-      std::function<void(const HttpResponsePtr &)> &&callback);
+  void handlePrelight(const HttpRequestPtr &req,
+                      std::function<void(const HttpResponsePtr &)> &&callback);
   void embedding(const HttpRequestPtr &req,
                  std::function<void(const HttpResponsePtr &)> &&callback);
   void loadModel(const HttpRequestPtr &req,
                  std::function<void(const HttpResponsePtr &)> &&callback);
   void unloadModel(const HttpRequestPtr &req,
                    std::function<void(const HttpResponsePtr &)> &&callback);
-
   void modelStatus(const HttpRequestPtr &req,
                    std::function<void(const HttpResponsePtr &)> &&callback);
 
-  bool loadModelImpl(const Json::Value &jsonBody);
-
-  void warmupModel();
-
-  void backgroundTask();
-
-  void stopBackgroundTask();
-
 
private:
   llama_server_context llama;
   // std::atomic<bool> model_loaded = false;
@@ -2577,5 +2567,16 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   std::atomic<bool> single_queue_is_busy; // This value only used under the
                                           // condition n_parallel is 1
   std::string grammar_file_content;
+
+  bool loadModelImpl(std::shared_ptr<Json::Value> jsonBody);
+  void
+  chatCompletionImpl(std::shared_ptr<Json::Value> jsonBody,
+                     std::function<void(const HttpResponsePtr &)> &callback);
+  void embeddingImpl(std::shared_ptr<Json::Value> jsonBody,
+                     std::function<void(const HttpResponsePtr &)> &callback);
+  void checkModelLoaded(std::function<void(const HttpResponsePtr &)> &callback);
+  void warmupModel();
+  void backgroundTask();
+  void stopBackgroundTask();
 };
 }; // namespace inferences
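
A note on the inferenceState / create_inference_state pattern introduced in this patch: handing out a std::shared_ptr means the per-request state stays alive for as long as any streaming lambda still holds a copy, even after the handler itself has returned. The standalone sketch below (not part of the patch; InferenceState, create_state and the detached thread are simplified stand-ins for the real struct and Drogon's streaming callback) illustrates that lifetime behaviour.

#include <chrono>
#include <iostream>
#include <memory>
#include <thread>

// Simplified stand-in for the PR's inferenceState; the real struct also keeps
// a pointer back to the owning llamaCPP instance.
struct InferenceState {
  bool is_stopped = false;
  bool is_streaming = false;
};

// Mirrors create_inference_state(): hand out shared ownership so the state
// outlives the scope that created it.
std::shared_ptr<InferenceState> create_state() {
  return std::make_shared<InferenceState>();
}

// Simulates a handler that returns immediately while a background "stream"
// keeps using the state it captured by value.
void handler() {
  auto state = create_state();
  std::thread([state] {  // the lambda copies the shared_ptr
    state->is_streaming = true;
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    state->is_streaming = false;
    std::cout << "stream finished, state still alive\n";
  }).detach();
}  // handler returns here; the lambda's copy keeps the state alive

int main() {
  handler();
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  return 0;
}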
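Similarly, for the loadModelImpl signature change from const Json::Value & to std::shared_ptr<Json::Value>: Drogon's req->getJsonObject() already returns a std::shared_ptr<Json::Value>, so passing the pointer straight through keeps the null-body check (if (jsonBody)) trivial. A minimal sketch of the JsonCpp access patterns the refactored code relies on follows; the keys mirror the patch, the values are made up, and the include path may be <jsoncpp/json/json.h> on some distributions.

#include <json/json.h>

#include <iostream>
#include <memory>

int main() {
  // Stand-in for req->getJsonObject(): the request body arrives as a
  // std::shared_ptr<Json::Value>.
  auto jsonBody = std::make_shared<Json::Value>();
  (*jsonBody)["llama_model_path"] = "/models/model.gguf";
  (*jsonBody)["ctx_len"] = 4096;

  // Optional keys fall back to defaults via Json::Value::get(key, default),
  // the same pattern loadModelImpl() uses through the shared_ptr.
  const std::string model = (*jsonBody)["llama_model_path"].asString();
  const int n_ctx = jsonBody->get("ctx_len", 2048).asInt();
  const int n_gpu_layers = jsonBody->get("ngl", 100).asInt();

  // operator[] on a missing key yields a null value, which is what the
  // !jsonBody->operator[]("mmproj").isNull() checks rely on.
  const bool has_mmproj = !(*jsonBody)["mmproj"].isNull();

  std::cout << model << " ctx=" << n_ctx << " ngl=" << n_gpu_layers
            << " mmproj=" << std::boolalpha << has_mmproj << "\n";
  return 0;
}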