diff --git a/CMakeLists.txt b/CMakeLists.txt
index 89e3a88fe..39c988e09 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,7 +51,7 @@ add_executable(${PROJECT_NAME} main.cc)
 #
 # and comment out the following lines
 find_package(Drogon CONFIG REQUIRED)
-target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama clip
+target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama llava
                       ${CMAKE_THREAD_LIBS_INIT})
 
 # ##############################################################################
diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index f7b738724..d8fffff46 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -110,7 +110,7 @@ void llamaCPP::chatCompletion(
     data["stop"] = stopWords;
   }
 
-  const int task_id = llama.request_completion(data, false);
+  const int task_id = llama.request_completion(data, false,false);
   LOG_INFO << "Resolved request for task_id:" << task_id;
 
   auto state = createState(task_id, this);
@@ -178,7 +178,7 @@ void llamaCPP::embedding(
     prompt = "";
   }
   const int task_id =
-      llama.request_completion({{"prompt", prompt}, {"n_predict", 0}}, false);
+      llama.request_completion({{"prompt", prompt}, {"n_predict", 0}}, false, true);
   task_result result = llama.next_result(task_id);
   std::string embeddingResp = result.result_json.dump();
   auto resp = nitro_utils::nitroHttpResponse();
@@ -226,8 +226,8 @@ void llamaCPP::loadModel(
 
   llama_backend_init(params.numa);
 
-  LOG_INFO_LLAMA("build info",
-                 {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
+  // LOG_INFO_LLAMA("build info",
+  //                {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
   LOG_INFO_LLAMA("system info", {
                                     {"n_threads", params.n_threads},
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 01fcab840..0c0c1517f 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -15,11 +15,9 @@
 #include
 
 // External
-
-#include "build-info.h"
 #include "common.h"
-#include "grammar-parser.h"
 #include "llama.h"
+#include "grammar-parser.h"
 
 #include "../../llama.cpp/examples/llava/clip.h"
 
@@ -34,10 +32,10 @@
 
 // auto generated files (update with ./deps.sh)
 
-#include
 #include
-#include
 #include
+#include
+#include
 
 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -45,12 +43,13 @@
 
 using json = nlohmann::json;
 
-struct server_params {
-  std::string hostname = "127.0.0.1";
-  std::string public_path = "examples/server/public";
-  int32_t port = 8080;
-  int32_t read_timeout = 600;
-  int32_t write_timeout = 600;
+struct server_params
+{
+    std::string hostname = "127.0.0.1";
+    std::string public_path = "examples/server/public";
+    int32_t port = 8080;
+    int32_t read_timeout = 600;
+    int32_t write_timeout = 600;
 };
 
 static bool server_verbose = false;
@@ -58,2002 +57,2188 @@ static bool server_verbose = false;
 #if SERVER_VERBOSE != 1
 #define LOG_VERBOSE(MSG, ...)
 #else
-#define LOG_VERBOSE(MSG, ...) \
-  do { \
-    if (server_verbose) { \
-      server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
-    } \
-  } while (0)
+#define LOG_VERBOSE(MSG, ...) \
+    do \
+    { \
+        if (server_verbose) \
+        { \
+            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+        } \
+    } while (0)
 #endif
 
-#define LOG_ERROR_LLAMA(MSG, ...) \
-    server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING_LLAMA(MSG, ...) \
-    server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_INFO_LLAMA(MSG, ...) \
-    server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR_LLAMA( MSG, ...)
server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING_LLAMA(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO_LLAMA( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) // // base64 utils (TODO: move to common in the future) // -static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; -static inline bool is_base64(uint8_t c) { - return (isalnum(c) || (c == '+') || (c == '/')); +static inline bool is_base64(uint8_t c) +{ + return (isalnum(c) || (c == '+') || (c == '/')); } -static std::vector base64_decode(std::string const &encoded_string) { - int i = 0; - int j = 0; - int in_ = 0; - - int in_len = encoded_string.size(); - - uint8_t char_array_4[4]; - uint8_t char_array_3[3]; - - std::vector ret; - - while (in_len-- && (encoded_string[in_] != '=') && - is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; - in_++; - if (i == 4) { - for (i = 0; i < 4; i++) { - char_array_4[i] = base64_chars.find(char_array_4[i]); - } - - char_array_3[0] = - ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = - ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (i = 0; (i < 3); i++) { - ret.push_back(char_array_3[i]); - } - i = 0; - } - } +static std::vector base64_decode(std::string const &encoded_string) +{ + int i = 0; + int j = 0; + int in_ = 0; - if (i) { - for (j = i; j < 4; j++) { - char_array_4[j] = 0; - } + int in_len = encoded_string.size(); + + uint8_t char_array_4[4]; + uint8_t char_array_3[3]; + + std::vector ret; + + while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) + { + char_array_4[i++] = encoded_string[in_]; in_++; + if (i == 4) + { + for (i = 0; i <4; i++) + { + char_array_4[i] = base64_chars.find(char_array_4[i]); + } - for (j = 0; j < 4; j++) { - char_array_4[j] = base64_chars.find(char_array_4[j]); + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) + { + ret.push_back(char_array_3[i]); + } + i = 0; + } } - char_array_3[0] = - ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = - ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + if (i) + { + for (j = i; j <4; j++) + { + char_array_4[j] = 0; + } + + for (j = 0; j <4; j++) + { + char_array_4[j] = base64_chars.find(char_array_4[j]); + } + + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - for (j = 0; (j < i - 1); j++) { - ret.push_back(char_array_3[j]); + for (j = 0; (j < i - 1); j++) + { + ret.push_back(char_array_3[j]); + } } - } - return ret; + return ret; } // // parallel // -enum task_type { COMPLETION_TASK, CANCEL_TASK }; +enum task_type { + COMPLETION_TASK, + CANCEL_TASK +}; struct task_server { - int id; - int target_id; - task_type type; - json data; - bool infill_mode = false; + int id; + int 
target_id; + task_type type; + json data; + bool infill_mode = false; + bool embedding_mode = false; }; struct task_result { - int id; - bool stop; - bool error; - json result_json; + int id; + bool stop; + bool error; + json result_json; }; // TODO: can become bool if we can't find use of more states -enum slot_state { - IDLE, - PROCESSING, +enum slot_state +{ + IDLE, + PROCESSING, }; -enum slot_command { - NONE, - LOAD_PROMPT, - RELEASE, +enum slot_command +{ + NONE, + LOAD_PROMPT, + RELEASE, }; -struct slot_params { - bool stream = true; - bool cache_prompt = - false; // remember the prompt to avoid reprocessing all prompt +struct slot_params +{ + bool stream = true; + bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt - uint32_t seed = -1; // RNG seed - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_predict = -1; // new tokens to predict + uint32_t seed = -1; // RNG seed + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_predict = -1; // new tokens to predict - std::vector antiprompt; + std::vector antiprompt; - json input_prefix; - json input_suffix; + json input_prefix; + json input_suffix; }; -struct slot_image { - int32_t id; +struct slot_image +{ + int32_t id; - bool request_encode_image = false; - float *image_embedding = nullptr; - int32_t image_tokens = 0; + bool request_encode_image = false; + float* image_embedding = nullptr; + int32_t image_tokens = 0; - clip_image_u8 img_data; + clip_image_u8 img_data; - std::string prefix_prompt; // before of this image + std::string prefix_prompt; // before of this image }; // completion token output with probabilities -struct completion_token_output { - struct token_prob { - llama_token tok; - float prob; - }; +struct completion_token_output +{ + struct token_prob + { + llama_token tok; + float prob; + }; - std::vector probs; - llama_token tok; - std::string text_to_send; + std::vector probs; + llama_token tok; + std::string text_to_send; }; -static size_t common_part(const std::vector &a, - const std::vector &b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) { - } - return i; +static size_t common_part(const std::vector &a, const std::vector &b) +{ + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) + { + } + return i; } -enum stop_type { - STOP_FULL, - STOP_PARTIAL, +enum stop_type +{ + STOP_FULL, + STOP_PARTIAL, }; -static bool ends_with(const std::string &str, const std::string &suffix) { - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +static bool ends_with(const std::string &str, const std::string &suffix) +{ + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } static size_t find_partial_stop_string(const std::string &stop, - const std::string &text) { - if (!text.empty() && !stop.empty()) { - const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { - if (stop[char_index] == text_last_char) { - const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) { - return text.size() - char_index - 1; - } - } + const std::string &text) +{ + if (!text.empty() && !stop.empty()) + { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) + { + if (stop[char_index] == text_last_char) + { + const 
std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) + { + return text.size() - char_index - 1; + } + } + } } - } - return std::string::npos; + return std::string::npos; } // TODO: reuse llama_detokenize template -static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) { - std::string ret; - for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); - } - return ret; +static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) +{ + std::string ret; + for (; begin != end; ++begin) + { + ret += llama_token_to_piece(ctx, *begin); + } + return ret; } static void server_log(const char *level, const char *function, int line, - const char *message, - const nlohmann::ordered_json &extra) { - nlohmann::ordered_json log{ - {"timestamp", time(nullptr)}, {"level", level}, - {"function", function}, {"line", line}, - {"message", message}, - }; - - if (!extra.empty()) { - log.merge_patch(extra); - } + const char *message, const nlohmann::ordered_json &extra) +{ + nlohmann::ordered_json log + { + {"timestamp", time(nullptr)}, + {"level", level}, + {"function", function}, + {"line", line}, + {"message", message}, + }; + + if (!extra.empty()) + { + log.merge_patch(extra); + } - const std::string str = - log.dump(-1, ' ', false, json::error_handler_t::replace); - LOG_INFO << str; + const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); + LOG_INFO << str; } // format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context *ctx, - const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } - return out; +static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) +{ + std::string out = token == -1 ? 
"" : llama_token_to_piece(ctx, token); + // if the size is 1 and first bit is 1, meaning it's a partial character + // (size > 1 meaning it's already a known token) + if (out.size() == 1 && (out[0] & 0x80) == 0x80) + { + std::stringstream ss; + ss << std::hex << (out[0] & 0xff); + std::string res(ss.str()); + out = "byte: \\x" + res; + } + return out; } // convert a vector of completion_token_output to json -static json -probs_vector_to_json(const llama_context *ctx, - const std::vector &probs) { - json out = json::array(); - for (const auto &prob : probs) { - json probs_for_token = json::array(); - for (const auto &p : prob.probs) { - std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json{ - {"tok_str", tok_str}, - {"prob", p.prob}, - }); +static json probs_vector_to_json(const llama_context *ctx, const std::vector &probs) +{ + json out = json::array(); + for (const auto &prob : probs) + { + json probs_for_token = json::array(); + for (const auto &p : prob.probs) + { + std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); + probs_for_token.push_back(json + { + {"tok_str", tok_str}, + {"prob", p.prob}, + }); + } + std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + out.push_back(json{ + {"content", tok_str}, + {"probs", probs_for_token}, + }); } - std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json{ - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - return out; + return out; } template -static T json_value(const json &body, const std::string &key, - const T &default_value) { - // Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? body.value(key, default_value) - : default_value; +static T json_value(const json &body, const std::string &key, const T &default_value) +{ + // Fallback null to default value + return body.contains(key) && !body.at(key).is_null() + ? 
body.value(key, default_value) + : default_value; } -struct llama_client_slot { - int id; - int task_id = -1; - - struct slot_params params; +struct llama_client_slot +{ + int id; + int task_id = -1; - slot_state state = IDLE; - slot_command command = NONE; + struct slot_params params; - // used to determine the slot that has been used the longest - int64_t t_last_used = -1; + slot_state state = IDLE; + slot_command command = NONE; - // generation props - int32_t n_ctx = 0; // context size per slot - int32_t n_past = 0; - int32_t n_decoded = 0; - int32_t n_remaining = -1; - int32_t i_batch = -1; + // used to determine the slot that has been used the longest + int64_t t_last_used = -1; - int32_t num_prompt_tokens = 0; - int32_t num_prompt_tokens_processed = 0; - int32_t multibyte_pending = 0; + // generation props + int32_t n_ctx = 0; // context size per slot + int32_t n_past = 0; + int32_t n_decoded = 0; + int32_t n_remaining = -1; + int32_t i_batch = -1; - json prompt; - std::string generated_text; - llama_token sampled; - std::vector cache_tokens; - std::vector generated_token_probs; + int32_t num_prompt_tokens = 0; + int32_t num_prompt_tokens_processed = 0; + int32_t multibyte_pending = 0; - bool infill = false; - bool has_next_token = true; - bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; + json prompt; + std::string generated_text; + llama_token sampled; + std::vector cache_tokens; + std::vector generated_token_probs; - std::string stopping_word; + bool infill = false; + bool embedding = false; + bool has_next_token = true; + bool truncated = false; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; - // sampling - struct llama_sampling_params sparams; - llama_sampling_context *ctx_sampling = nullptr; + std::string stopping_word; - // multimodal - std::vector images; - - // stats - size_t sent_count = 0; - size_t sent_token_probs_index = 0; + // sampling + struct llama_sampling_params sparams; + llama_sampling_context *ctx_sampling = nullptr; + + // multimodal + std::vector images; + + // stats + size_t sent_count = 0; + size_t sent_token_probs_index = 0; + + int64_t t_start_process_prompt; + int64_t t_start_genereration; + + double t_prompt_processing; // ms + double t_token_generation; // ms + + void reset() { + num_prompt_tokens = 0; + generated_text = ""; + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; + stopping_word = ""; + multibyte_pending = 0; + n_past = 0; + sent_count = 0; + sent_token_probs_index = 0; + infill = false; + + generated_token_probs.clear(); + + for (slot_image &img : images) + { + free(img.image_embedding); + delete[] img.img_data.data; + img.prefix_prompt = ""; + } - int64_t t_start_process_prompt; - int64_t t_start_genereration; - - double t_prompt_processing; // ms - double t_token_generation; // ms - - void reset() { - num_prompt_tokens = 0; - generated_text = ""; - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - multibyte_pending = 0; - n_past = 0; - sent_count = 0; - sent_token_probs_index = 0; - infill = false; - - generated_token_probs.clear(); - - for (slot_image &img : images) { - free(img.image_embedding); - delete[] img.img_data.data; - img.prefix_prompt = ""; + images.clear(); + // llama_set_rng_seed(ctx, params.seed); in batched the seed matter??????? 
} - images.clear(); - // llama_set_rng_seed(ctx, params.seed); in batched the seed matter??????? - } - - bool has_budget(gpt_params &global_params) { - n_remaining = -1; - if (params.n_predict != -1) { - n_remaining = params.n_predict - n_decoded; - } else if (global_params.n_predict != -1) { - n_remaining = global_params.n_predict - n_decoded; + bool has_budget(gpt_params &global_params) { + n_remaining = -1; + if(params.n_predict != -1) + { + n_remaining = params.n_predict - n_decoded; + } + else if (global_params.n_predict != -1) + { + n_remaining = global_params.n_predict - n_decoded; + } + return n_remaining > 0 || n_remaining == -1; // no budget || limitless } - return n_remaining > 0 || n_remaining == -1; // no budget || limitless - } - bool available() const { return state == IDLE && command == NONE; } + bool available() const { + return state == IDLE && command == NONE; + } - bool is_processing() const { - return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; - } + bool is_processing() const { + return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; + } - void add_token_string(const completion_token_output &token) { - if (command == RELEASE) { - return; + void add_token_string(const completion_token_output &token) { + if (command == RELEASE) + { + return; + } + cache_tokens.push_back(token.tok); + generated_token_probs.push_back(token); } - cache_tokens.push_back(token.tok); - generated_token_probs.push_back(token); - } - void release() { - if (state == IDLE || state == PROCESSING) { - t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; - command = RELEASE; + void release() { + if (state == IDLE || state == PROCESSING) + { + t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; + command = RELEASE; + } } - } - json get_formated_timings() { - return json{ - {"prompt_n", num_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", - t_prompt_processing / num_prompt_tokens_processed}, - {"prompt_per_second", - 1e3 / t_prompt_processing * num_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; - } + json get_formated_timings() { + return json + { + {"prompt_n", num_prompt_tokens_processed}, + {"prompt_ms", t_prompt_processing}, + {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, + {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed}, + + {"predicted_n", n_decoded}, + {"predicted_ms", t_token_generation}, + {"predicted_per_token_ms", t_token_generation / n_decoded}, + {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, + }; + } - void print_timings() { - LOG_TEE("\n"); - LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per " - "token, %8.2f tokens per second)\n", - __func__, t_prompt_processing, num_prompt_tokens_processed, - t_prompt_processing / num_prompt_tokens_processed, - 1e3 / t_prompt_processing * num_prompt_tokens_processed); - LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per " - "token, %8.2f tokens per second)\n", - __func__, t_token_generation, n_decoded, - t_token_generation / n_decoded, - 1e3 / t_token_generation * n_decoded); - LOG_TEE("%s: total time = %10.2f ms\n", __func__, - t_prompt_processing + t_token_generation); - } + void print_timings() { + LOG_TEE("\n"); + LOG_TEE("%s: prompt eval time = %10.2f 
ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed); + LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded); + LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation); + } }; -struct llama_server_context { - llama_model *model = nullptr; - llama_context *ctx = nullptr; +struct llama_server_context +{ + llama_model *model = nullptr; + llama_context *ctx = nullptr; - clip_ctx *clp_ctx = nullptr; + clip_ctx *clp_ctx = nullptr; - gpt_params params; + gpt_params params; - llama_batch batch; + llama_batch batch; - bool multimodal = false; - bool clean_kv_cache = true; - bool all_slots_are_idle = false; + bool multimodal = false; + bool clean_kv_cache = true; + bool all_slots_are_idle = false; - int32_t id_gen; - int32_t n_ctx; // total context for all clients / slots + int32_t id_gen; + int32_t n_ctx; // total context for all clients / slots - // system prompt - bool system_need_update = false; + // system prompt + bool system_need_update = false; - std::string system_prompt; - std::vector system_tokens; + std::string system_prompt; + std::vector system_tokens; - std::string name_user; // this should be the antiprompt - std::string name_assistant; + std::string name_user; // this should be the antiprompt + std::string name_assistant; - // slots / clients - std::vector slots; + // slots / clients + std::vector slots; - std::vector queue_tasks; - std::vector queue_results; - std::mutex mutex_tasks; - std::mutex mutex_results; + std::vector queue_tasks; + std::vector queue_results; + std::mutex mutex_tasks; + std::mutex mutex_results; - ~llama_server_context() { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - if (model) { - llama_free_model(model); - model = nullptr; + ~llama_server_context() + { + if (ctx) + { + llama_free(ctx); + ctx = nullptr; + } + if (model) + { + llama_free_model(model); + model = nullptr; + } } - } - bool load_model(const gpt_params ¶ms_) { - params = params_; - if (!params.mmproj.empty()) { - multimodal = true; - LOG_TEE("Multi Modal Mode Enabled"); - clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/1); - if (clp_ctx == nullptr) { - LOG_ERROR_LLAMA("unable to load clip model", - {{"model", params.mmproj}}); - return false; - } - - if (params.n_ctx < - 2048) { // request larger context for the image embedding - params.n_ctx = 2048; - } - } + bool load_model(const gpt_params ¶ms_) + { + params = params_; + if (!params.mmproj.empty()) { + multimodal = true; + LOG_TEE("Multi Modal Mode Enabled"); + clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); + if(clp_ctx == nullptr) { + LOG_ERROR_LLAMA("unable to load clip model", {{"model", params.mmproj}}); + return false; + } - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == nullptr) { - LOG_ERROR_LLAMA("unable to load model", {{"model", params.model}}); - return false; - } + if (params.n_ctx < 2048) { // request larger context for the image embedding + params.n_ctx = 2048; + } + } + + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr) + { + LOG_ERROR_LLAMA("unable to load model", {{"model", params.model}}); + return false; + } + + if (multimodal) { + const int n_embd_clip 
= clip_n_mmproj_embd(clp_ctx); + const int n_embd_llm = llama_n_embd(model); + if (n_embd_clip != n_embd_llm) { + LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm); + llama_free(ctx); + llama_free_model(model); + return false; + } + } + + n_ctx = llama_n_ctx(ctx); - if (multimodal) { - const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); - const int n_embd_llm = llama_n_embd(model); - if (n_embd_clip != n_embd_llm) { - LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not " - "equal to that of LLaMA (%d). Make sure that you use the " - "correct mmproj file.\n", - __func__, n_embd_clip, n_embd_llm); - llama_free(ctx); - llama_free_model(model); - return false; - } + return true; } - n_ctx = llama_n_ctx(ctx); + void initialize() { + id_gen = 0; - return true; - } + // create slots + all_slots_are_idle = true; - void initialize() { - id_gen = 0; + const int32_t n_ctx_slot = n_ctx / params.n_parallel; - // create slots - all_slots_are_idle = true; + LOG_TEE("Available slots:\n"); + for (int i = 0; i < params.n_parallel; i++) + { + llama_client_slot slot; - const int32_t n_ctx_slot = n_ctx / params.n_parallel; + slot.id = i; + slot.n_ctx = n_ctx_slot; + slot.reset(); - LOG_TEE("Available slots:\n"); - for (int i = 0; i < params.n_parallel; i++) { - llama_client_slot slot; + LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + slots.push_back(slot); + } - slot.id = i; - slot.n_ctx = n_ctx_slot; - slot.reset(); + batch = llama_batch_init(n_ctx, 0, params.n_parallel); - LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); - slots.push_back(slot); + // empty system prompt + system_prompt = ""; + system_tokens.clear(); } - batch = llama_batch_init(n_ctx, 0, params.n_parallel); + std::vector tokenize(const json & json_prompt, bool add_bos) const + { + // If `add_bos` is true, we only add BOS, when json_prompt is a string, + // or the first element of the json_prompt array is a string. + std::vector prompt_tokens; - // empty system prompt - system_prompt = ""; - system_tokens.clear(); - } + if (json_prompt.is_array()) + { + bool first = true; + for (const auto& p : json_prompt) + { + if (p.is_string()) + { + auto s = p.template get(); + std::vector p; + if (first) + { + p = ::llama_tokenize(ctx, s, add_bos); + first = false; + } + else + { + p = ::llama_tokenize(ctx, s, false); + } + prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + } + else + { + if (first) + { + first = false; + } + prompt_tokens.push_back(p.template get()); + } + } + } + else + { + auto s = json_prompt.template get(); + prompt_tokens = ::llama_tokenize(ctx, s, add_bos); + } - std::vector tokenize(const json &json_prompt, - bool add_bos) const { - // If `add_bos` is true, we only add BOS, when json_prompt is a string, - // or the first element of the json_prompt array is a string. 
- std::vector prompt_tokens; - - if (json_prompt.is_array()) { - bool first = true; - for (const auto &p : json_prompt) { - if (p.is_string()) { - auto s = p.template get(); - std::vector p; - if (first) { - p = ::llama_tokenize(ctx, s, add_bos); - first = false; - } else { - p = ::llama_tokenize(ctx, s, false); - } - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); - } else { - if (first) { - first = false; - } - prompt_tokens.push_back(p.template get()); - } - } - } else { - auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_bos); + return prompt_tokens; } - return prompt_tokens; - } + llama_client_slot* get_slot(int id) { + int64_t t_last = ggml_time_us(); + llama_client_slot *last_used = nullptr; - llama_client_slot *get_slot(int id) { - int64_t t_last = ggml_time_us(); - llama_client_slot *last_used = nullptr; + for (llama_client_slot & slot : slots) + { + if (slot.id == id && slot.available()) + { + return &slot; + } - for (llama_client_slot &slot : slots) { - if (slot.id == id && slot.available()) { - return &slot; - } + if (slot.available() && slot.t_last_used < t_last) + { + last_used = &slot; + t_last = slot.t_last_used; + } + } - if (slot.available() && slot.t_last_used < t_last) { - last_used = &slot; - t_last = slot.t_last_used; - } + return last_used; } - return last_used; - } - - bool launch_slot_with_data(llama_client_slot *&slot, json data) { - slot_params default_params; - llama_sampling_params default_sparams; - - slot->params.stream = json_value(data, "stream", false); - slot->params.cache_prompt = json_value(data, "cache_prompt", false); - slot->params.n_predict = - json_value(data, "n_predict", default_params.n_predict); - slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); - slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot->sparams.typical_p = - json_value(data, "typical_p", default_sparams.typical_p); - slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot->sparams.penalty_last_n = - json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot->sparams.penalty_repeat = - json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot->sparams.penalty_freq = - json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot->sparams.penalty_present = - json_value(data, "presence_penalty", default_sparams.penalty_present); - slot->sparams.mirostat = - json_value(data, "mirostat", default_sparams.mirostat); - slot->sparams.mirostat_tau = - json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot->sparams.mirostat_eta = - json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = - json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); - slot->sparams.grammar = - json_value(data, "grammar", default_sparams.grammar); - slot->sparams.n_probs = - json_value(data, "n_probs", default_sparams.n_probs); - - // infill - if (data.count("input_prefix") != 0) { - slot->params.input_prefix = data["input_prefix"]; - } else { - slot->params.input_prefix = ""; - } + bool launch_slot_with_data(llama_client_slot* &slot, json data) { + slot_params default_params; + llama_sampling_params default_sparams; + + slot->params.stream = json_value(data, "stream", false); 
+ slot->params.cache_prompt = json_value(data, "cache_prompt", false); + slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); + slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); + slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); + slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); + slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); + slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); + slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); + slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); + slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); + + // infill + if (data.count("input_prefix") != 0) + { + slot->params.input_prefix = data["input_prefix"]; + } + else + { + slot->params.input_prefix = ""; + } - if (data.count("input_suffix") != 0) { - slot->params.input_suffix = data["input_suffix"]; - } else { - slot->params.input_suffix = ""; - } + if (data.count("input_suffix") != 0) + { + slot->params.input_suffix = data["input_suffix"]; + } + else + { + slot->params.input_suffix = ""; + } - if (data.count("prompt") != 0) { - slot->prompt = data["prompt"]; - } else { - slot->prompt = ""; - } + if (data.count("prompt") != 0) + { + slot->prompt = data["prompt"]; + } + else + { + slot->prompt = ""; + } - slot->sparams.logit_bias.clear(); + slot->sparams.logit_bias.clear(); - if (json_value(data, "ignore_eos", false)) { - slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } + if (json_value(data, "ignore_eos", false)) + { + slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; + } - const auto &logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(model); - for (const auto &el : *logit_bias) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - if (el[1].is_number()) { - slot->sparams.logit_bias[tok] = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - slot->sparams.logit_bias[tok] = -INFINITY; - } - } - } - } - } + const auto &logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) + { + const int n_vocab = llama_n_vocab(model); + for (const auto &el : *logit_bias) + { + if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) + { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) + { + if (el[1].is_number()) + { + slot->sparams.logit_bias[tok] = el[1].get(); + } + else if 
(el[1].is_boolean() && !el[1].get()) + { + slot->sparams.logit_bias[tok] = -INFINITY; + } + } + } + } + } - slot->params.antiprompt.clear(); + slot->params.antiprompt.clear(); - const auto &stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto &word : *stop) { - if (!word.empty()) { - slot->params.antiprompt.push_back(word); + const auto &stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) + { + for (const auto &word : *stop) + { + if (!word.empty()) + { + slot->params.antiprompt.push_back(word); + } + } } - } - } - if (multimodal) { - const auto &images_data = data.find("image_data"); - if (images_data != data.end() && images_data->is_array()) { - for (const auto &img : *images_data) { - std::string data_b64 = img["data"].get(); - slot_image img_sl; - img_sl.id = - img.count("id") != 0 ? img["id"].get() : slot->images.size(); - int width, height, channels; - std::vector image_buffer = base64_decode(data_b64); - data_b64.clear(); - auto data = - stbi_load_from_memory(image_buffer.data(), image_buffer.size(), - &width, &height, &channels, 3); - if (!data) { - LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, - img_sl.id); - return false; - } - LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", - slot->id, img_sl.id, width, height); - img_sl.img_data.nx = width; - img_sl.img_data.ny = height; - img_sl.img_data.size = width * height * 3; - img_sl.img_data.data = new uint8_t[width * height * 3](); - memcpy(img_sl.img_data.data, data, width * height * 3); - stbi_image_free(data); - img_sl.request_encode_image = true; - slot->images.push_back(img_sl); - } - // process prompt - // example: system prompt [img-102] user [img-103] describe [img-134] -> - // [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, - // {id: 134, prefix: ' describe '}]} - if (slot->images.size() > 0 && !slot->prompt.is_array()) { - std::string prompt = slot->prompt.get(); - size_t pos = 0, begin_prefix = 0; - std::string pattern = "[img-"; - while ((pos = prompt.find(pattern, pos)) != std::string::npos) { - size_t end_prefix = pos; - pos += pattern.length(); - size_t end_pos = prompt.find("]", pos); - if (end_pos != std::string::npos) { - std::string image_id = prompt.substr(pos, end_pos - pos); - try { - int img_id = std::stoi(image_id); - bool found = false; - for (slot_image &img : slot->images) { - if (img.id == img_id) { - found = true; - img.prefix_prompt = - prompt.substr(begin_prefix, end_prefix - begin_prefix); - begin_prefix = end_pos + 1; - break; - } + if (multimodal) + { + const auto &images_data = data.find("image_data"); + if (images_data != data.end() && images_data->is_array()) + { + for (const auto &img : *images_data) + { + std::string data_b64 = img["data"].get(); + slot_image img_sl; + img_sl.id = img.count("id") != 0 ? 
img["id"].get() : slot->images.size(); + int width, height, channels; + std::vector image_buffer = base64_decode(data_b64); + data_b64.clear(); + auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3); + if (!data) { + LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id); + return false; + } + LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height); + img_sl.img_data.nx = width; + img_sl.img_data.ny = height; + img_sl.img_data.size = width * height * 3; + img_sl.img_data.data = new uint8_t[width * height * 3](); + memcpy(img_sl.img_data.data, data, width * height * 3); + stbi_image_free(data); + img_sl.request_encode_image = true; + slot->images.push_back(img_sl); } - if (!found) { - LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id); - slot->images.clear(); - return false; + // process prompt + // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]} + if (slot->images.size() > 0 && !slot->prompt.is_array()) + { + std::string prompt = slot->prompt.get(); + size_t pos = 0, begin_prefix = 0; + std::string pattern = "[img-"; + while ((pos = prompt.find(pattern, pos)) != std::string::npos) { + size_t end_prefix = pos; + pos += pattern.length(); + size_t end_pos = prompt.find("]", pos); + if (end_pos != std::string::npos) + { + std::string image_id = prompt.substr(pos, end_pos - pos); + try + { + int img_id = std::stoi(image_id); + bool found = false; + for (slot_image &img : slot->images) + { + if (img.id == img_id) { + found = true; + img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix); + begin_prefix = end_pos + 1; + break; + } + } + if (!found) { + LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id); + slot->images.clear(); + return false; + } + } catch (const std::invalid_argument& e) { + LOG_TEE("Invalid image number id in prompt\n"); + slot->images.clear(); + return false; + } + } + } + slot->prompt = ""; + slot->params.input_suffix = prompt.substr(begin_prefix); + slot->params.cache_prompt = false; // multimodal doesn't support cache prompt } - } catch (const std::invalid_argument &e) { - LOG_TEE("Invalid image number id in prompt\n"); - slot->images.clear(); - return false; - } } - } - slot->prompt = ""; - slot->params.input_suffix = prompt.substr(begin_prefix); - slot->params.cache_prompt = - false; // multimodal doesn't support cache prompt } - } + + if (slot->ctx_sampling != nullptr) + { + llama_sampling_free(slot->ctx_sampling); + } + slot->ctx_sampling = llama_sampling_init(slot->sparams); + slot->command = LOAD_PROMPT; + + all_slots_are_idle = false; + + LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + + return true; } - if (slot->ctx_sampling != nullptr) { - llama_sampling_free(slot->ctx_sampling); + void kv_cache_clear() { + // clear the entire KV cache + llama_kv_cache_clear(ctx); + clean_kv_cache = false; } - slot->ctx_sampling = llama_sampling_init(slot->sparams); - slot->command = LOAD_PROMPT; - all_slots_are_idle = false; + void update_system_prompt() { + system_tokens = ::llama_tokenize(ctx, system_prompt, true); - LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + llama_batch_clear(batch); - return true; - } - - void kv_cache_clear() { - // clear the entire KV cache - llama_kv_cache_clear(ctx); - clean_kv_cache = false; - } + kv_cache_clear(); - void 
update_system_prompt() { - system_tokens = ::llama_tokenize(ctx, system_prompt, true); + for (int i = 0; i < (int) system_tokens.size(); ++i) + { + llama_batch_add(batch, system_tokens[i], i, { 0 }, false); + } - llama_batch_clear(batch); + if (llama_decode(ctx, batch) != 0) + { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return; + } - kv_cache_clear(); + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i < params.n_parallel; ++i) + { + llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); + } - for (int i = 0; i < (int)system_tokens.size(); ++i) { - llama_batch_add(batch, system_tokens[i], i, {0}, false); + LOG_TEE("system prompt updated\n"); + system_need_update = false; } - if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); - return; - } + void notify_system_prompt_changed() { + // release all slots + for (llama_client_slot &slot : slots) + { + slot.release(); + } - // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i < params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); + system_need_update = true; } - LOG_TEE("system prompt updated\n"); - system_need_update = false; - } + void process_system_prompt_data(const json &sys_props) { + system_prompt = sys_props.value("prompt", ""); + name_user = sys_props.value("anti_prompt", ""); + name_assistant = sys_props.value("assistant_name", ""); - void notify_system_prompt_changed() { - // release all slots - for (llama_client_slot &slot : slots) { - slot.release(); + if (slots.size() > 0) + { + notify_system_prompt_changed(); + } } - system_need_update = true; - } + static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, + const stop_type type, llama_client_slot &slot) + { + size_t stop_pos = std::string::npos; - void process_system_prompt_data(const json &sys_props) { - system_prompt = sys_props.value("prompt", ""); - name_user = sys_props.value("anti_prompt", ""); - name_assistant = sys_props.value("assistant_name", ""); + for (const std::string &word : slot.params.antiprompt) + { + size_t pos; + if (type == STOP_FULL) + { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); + } + else + { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) + { + if (type == STOP_FULL) + { + slot.stopped_word = true; + slot.stopping_word = word; + slot.has_next_token = false; + } + stop_pos = pos; + } + } - if (slots.size() > 0) { - notify_system_prompt_changed(); + return stop_pos; } - } - static size_t find_stopping_strings(const std::string &text, - const size_t last_token_size, - const stop_type type, - llama_client_slot &slot) { - size_t stop_pos = std::string::npos; - - for (const std::string &word : slot.params.antiprompt) { - size_t pos; - if (type == STOP_FULL) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? 
text.size() - tmp : 0; - pos = text.find(word, from_pos); - } else { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_FULL) { - slot.stopped_word = true; - slot.stopping_word = word; - slot.has_next_token = false; - } - stop_pos = pos; - } - } + bool process_token(completion_token_output &result, llama_client_slot &slot) { + // remember which tokens were sampled - used for repetition penalties during sampling + const std::string token_str = llama_token_to_piece(ctx, result.tok); + slot.sampled = result.tok; - return stop_pos; - } + // search stop word and delete it + slot.generated_text += token_str; + slot.has_next_token = true; - bool process_token(completion_token_output &result, llama_client_slot &slot) { - // remember which tokens were sampled - used for repetition penalties during - // sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok); - slot.sampled = result.tok; - - // search stop word and delete it - slot.generated_text += token_str; - slot.has_next_token = true; - - if (slot.multibyte_pending > 0) { - slot.multibyte_pending -= token_str.size(); - } else if (token_str.size() == 1) { - const char c = token_str[0]; - // 2-byte characters: 110xxxxx 10xxxxxx - if ((c & 0xE0) == 0xC0) { - slot.multibyte_pending = 1; - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF0) == 0xE0) { - slot.multibyte_pending = 2; - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF8) == 0xF0) { - slot.multibyte_pending = 3; - } else { - slot.multibyte_pending = 0; - } + if (slot.multibyte_pending > 0) + { + slot.multibyte_pending -= token_str.size(); + } + else if (token_str.size() == 1) + { + const char c = token_str[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) + { + slot.multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } + else if ((c & 0xF0) == 0xE0) + { + slot.multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } + else if ((c & 0xF8) == 0xF0) + { + slot.multibyte_pending = 3; + } + else + { + slot.multibyte_pending = 0; + } + } + + if (slot.multibyte_pending == 0) + { + size_t pos = std::min(slot.sent_count, slot.generated_text.size()); + const std::string str_test = slot.generated_text.substr(pos); + bool is_stop_full = false; + size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); + if (stop_pos != std::string::npos) + { + is_stop_full = true; + slot.generated_text.erase( + slot.generated_text.begin() + pos + stop_pos, + slot.generated_text.end()); + pos = std::min(slot.sent_count, slot.generated_text.size()); + } + else + { + is_stop_full = false; + stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot); + } + + // check if there is any token to predict + if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) + { + // no send the stop word in the response + result.text_to_send = slot.generated_text.substr(pos, std::string::npos); + slot.sent_count += result.text_to_send.size(); + // add the token to slot queue and cache + } + slot.add_token_string(result); + if (slot.params.stream) + { + send_partial_response(slot, result); + } + } + + if (slot.multibyte_pending > 0 && !slot.has_next_token) + { + slot.has_next_token = true; + } + + // check the limits + if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params)) + { + 
slot.stopped_limit = true; + slot.has_next_token = false; + } + + if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) + { + slot.stopped_eos = true; + slot.has_next_token = false; + LOG_VERBOSE("eos token found", {}); + } + + LOG_VERBOSE("next token", { + {"token", result.tok}, + {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, + {"num_tokens_predicted", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }); + + return slot.has_next_token; // continue } - if (slot.multibyte_pending == 0) { - size_t pos = std::min(slot.sent_count, slot.generated_text.size()); - const std::string str_test = slot.generated_text.substr(pos); - bool is_stop_full = false; - size_t stop_pos = - find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); - if (stop_pos != std::string::npos) { - is_stop_full = true; - slot.generated_text.erase(slot.generated_text.begin() + pos + stop_pos, - slot.generated_text.end()); - pos = std::min(slot.sent_count, slot.generated_text.size()); - } else { - is_stop_full = false; - stop_pos = find_stopping_strings(str_test, token_str.size(), - STOP_PARTIAL, slot); - } - - // check if there is any token to predict - if (stop_pos == std::string::npos || - (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { - // no send the stop word in the response - result.text_to_send = - slot.generated_text.substr(pos, std::string::npos); - slot.sent_count += result.text_to_send.size(); - // add the token to slot queue and cache - } - slot.add_token_string(result); - if (slot.params.stream) { - send_partial_response(slot, result); - } + bool process_images(llama_client_slot &slot) const + { + for (slot_image &img : slot.images) + { + if (!img.request_encode_image) + { + continue; + } + clip_image_f32 img_res; + if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true)) + { + LOG_TEE("Error processing the given image"); + clip_free(clp_ctx); + return false; + } + img.image_tokens = clip_n_patches(clp_ctx); + img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); + if (!img.image_embedding) + { + LOG_TEE("Unable to allocate memory for image embeddings\n"); + clip_free(clp_ctx); + return false; + } + LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); + if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding)) + { + LOG_TEE("Unable to encode image\n"); + return false; + } + img.request_encode_image = false; + } + + return slot.images.size() > 0; } - if (slot.multibyte_pending > 0 && !slot.has_next_token) { - slot.has_next_token = true; + void send_error(int id, std::string error) + { + std::lock_guard lock(mutex_results); + task_result res; + res.id = id; + res.error = true; + res.result_json = { { "content", error } }; + queue_results.push_back(res); } - // check the limits - if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params)) { - slot.stopped_limit = true; - slot.has_next_token = false; + json get_model_props() + { + return get_formated_generation(slots[0]); } - if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) { - slot.stopped_eos = true; - slot.has_next_token = false; - LOG_VERBOSE("eos token found", {}); + json get_formated_generation(llama_client_slot &slot) + { + const auto eos_bias = 
slot.sparams.logit_bias.find(llama_token_eos(model)); + const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && + eos_bias->second < 0.0f && std::isinf(eos_bias->second); + return json { + {"n_ctx", slot.n_ctx}, + {"model", params.model_alias}, + {"seed", slot.params.seed}, + {"temp", slot.sparams.temp}, + {"top_k", slot.sparams.top_k}, + {"top_p", slot.sparams.top_p}, + {"tfs_z", slot.sparams.tfs_z}, + {"typical_p", slot.sparams.typical_p}, + {"repeat_last_n", slot.sparams.penalty_last_n}, + {"repeat_penalty", slot.sparams.penalty_repeat}, + {"presence_penalty", slot.sparams.penalty_present}, + {"frequency_penalty", slot.sparams.penalty_freq}, + {"mirostat", slot.sparams.mirostat}, + {"mirostat_tau", slot.sparams.mirostat_tau}, + {"mirostat_eta", slot.sparams.mirostat_eta}, + {"penalize_nl", slot.sparams.penalize_nl}, + {"stop", slot.params.antiprompt}, + {"n_predict", slot.params.n_predict}, + {"n_keep", params.n_keep}, + {"ignore_eos", ignore_eos}, + {"stream", slot.params.stream}, + {"logit_bias", slot.sparams.logit_bias}, + {"n_probs", slot.sparams.n_probs}, + {"grammar", slot.sparams.grammar}, + }; } - LOG_VERBOSE( - "next token", - { - {"token", result.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"num_tokens_predicted", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }); + void send_partial_response(llama_client_slot &slot, completion_token_output tkn) + { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = false; - return slot.has_next_token; // continue - } + res.result_json = json + { + {"content", tkn.text_to_send}, + {"stop", false}, + {"slot_id", slot.id}, + {"multimodal", multimodal} + }; - bool process_images(llama_client_slot &slot) const { - for (slot_image &img : slot.images) { - if (!img.request_encode_image) { - continue; - } - clip_image_f32 img_res; - if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, - /*pad2square =*/true)) { - LOG_TEE("Error processing the given image"); - clip_free(clp_ctx); - return false; - } - img.image_tokens = clip_n_patches(clp_ctx); - img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); - if (!img.image_embedding) { - LOG_TEE("Unable to allocate memory for image embeddings\n"); - clip_free(clp_ctx); - return false; - } - LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); - if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, - img.image_embedding)) { - LOG_TEE("Unable to encode image\n"); - return false; - } - img.request_encode_image = false; - } + if (slot.sparams.n_probs > 0) + { + std::vector probs_output = {}; + const std::vector to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); + size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size()); + size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size()); + if (probs_pos < probs_stop_pos) + { + probs_output = std::vector(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); + } + slot.sent_token_probs_index = probs_stop_pos; + res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); + } - return slot.images.size() > 0; - } + queue_results.push_back(res); + 
} - void send_error(int id, std::string error) { - std::lock_guard lock(mutex_results); - task_result res; - res.id = id; - res.error = true; - res.result_json = {{"content", error}}; - queue_results.push_back(res); - } + void send_final_response(llama_client_slot &slot) + { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = true; - json get_model_props() { return get_formated_generation(slots[0]); } + res.result_json = json + { + {"content", !slot.params.stream ? slot.generated_text : ""}, + {"slot_id", slot.id}, + {"stop", true}, + {"model", params.model_alias}, + {"tokens_predicted", slot.n_decoded}, + {"tokens_evaluated", slot.num_prompt_tokens}, + {"generation_settings", get_formated_generation(slot)}, + {"prompt", slot.prompt}, + {"truncated", slot.truncated}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + {"tokens_cached", slot.n_past}, + {"timings", slot.get_formated_timings()} + }; - json get_formated_generation(llama_client_slot &slot) { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && - eos_bias->second < 0.0f && - std::isinf(eos_bias->second); - return json{ - {"n_ctx", slot.n_ctx}, - {"model", params.model_alias}, - {"seed", slot.params.seed}, - {"temp", slot.sparams.temp}, - {"top_k", slot.sparams.top_k}, - {"top_p", slot.sparams.top_p}, - {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, - {"repeat_last_n", slot.sparams.penalty_last_n}, - {"repeat_penalty", slot.sparams.penalty_repeat}, - {"presence_penalty", slot.sparams.penalty_present}, - {"frequency_penalty", slot.sparams.penalty_freq}, - {"mirostat", slot.sparams.mirostat}, - {"mirostat_tau", slot.sparams.mirostat_tau}, - {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"n_predict", slot.params.n_predict}, - {"n_keep", params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, - {"n_probs", slot.sparams.n_probs}, - {"grammar", slot.sparams.grammar}, - }; - } + if (slot.sparams.n_probs > 0) + { + std::vector probs = {}; + if (!slot.params.stream && slot.stopped_word) + { + const std::vector stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); + probs = std::vector(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size()); + } + else + { + probs = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.begin() + slot.sent_token_probs_index); + } + res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs); + } - void send_partial_response(llama_client_slot &slot, - completion_token_output tkn) { - std::lock_guard lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.error = false; - res.stop = false; - - res.result_json = json{{"content", tkn.text_to_send}, - {"stop", false}, - {"slot_id", slot.id}, - {"multimodal", multimodal}}; - - if (slot.sparams.n_probs > 0) { - std::vector probs_output = {}; - const std::vector to_send_toks = - llama_tokenize(ctx, tkn.text_to_send, false); - size_t probs_pos = std::min(slot.sent_token_probs_index, - slot.generated_token_probs.size()); - size_t probs_stop_pos = - std::min(slot.sent_token_probs_index + to_send_toks.size(), - 
slot.generated_token_probs.size()); - if (probs_pos < probs_stop_pos) { - probs_output = std::vector( - slot.generated_token_probs.begin() + probs_pos, - slot.generated_token_probs.begin() + probs_stop_pos); - } - slot.sent_token_probs_index = probs_stop_pos; - res.result_json["completion_probabilities"] = - probs_vector_to_json(ctx, probs_output); + queue_results.push_back(res); } - queue_results.push_back(res); - } + void send_embedding(llama_client_slot &slot) + { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = true; - void send_final_response(llama_client_slot &slot) { - std::lock_guard lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.error = false; - res.stop = true; - - res.result_json = - json{{"content", !slot.params.stream ? slot.generated_text : ""}, - {"slot_id", slot.id}, - {"stop", true}, - {"model", params.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.num_prompt_tokens}, - {"generation_settings", get_formated_generation(slot)}, - {"prompt", slot.prompt}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()}}; - - if (slot.sparams.n_probs > 0) { - std::vector probs = {}; - if (!slot.params.stream && slot.stopped_word) { - const std::vector stop_word_toks = - llama_tokenize(ctx, slot.stopping_word, false); - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - stop_word_toks.size()); - } else { - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.begin() + slot.sent_token_probs_index); - } - res.result_json["completion_probabilities"] = - probs_vector_to_json(ctx, probs); + const int n_embd = llama_n_embd(model); + if (!params.embedding) + { + LOG_WARNING_LLAMA("embedding disabled", { + {"params.embedding", params.embedding}, + }); + res.result_json = json + { + {"embedding", std::vector(n_embd, 0.0f)}, + }; + } + else + { + const float *data = llama_get_embeddings(ctx); + std::vector embedding(data, data + n_embd); + res.result_json = json + { + {"embedding", embedding }, + }; + } + queue_results.push_back(res); } - queue_results.push_back(res); - } + int request_completion(json data, bool infill, bool embedding) + { + std::lock_guard lock(mutex_tasks); + task_server task; + task.id = id_gen++; + task.data = data; + task.infill_mode = infill; + task.embedding_mode = embedding; + task.type = COMPLETION_TASK; + queue_tasks.push_back(task); + return task.id; + } - void send_embedding(llama_client_slot &slot) { - std::lock_guard lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.error = false; - res.stop = true; + task_result next_result(int task_id) + { + while (true) + { + std::this_thread::sleep_for(std::chrono::microseconds(5)); + std::lock_guard lock(mutex_results); - const int n_embd = llama_n_embd(model); - if (!params.embedding) { - LOG_WARNING_LLAMA("embedding disabled", - { - {"params.embedding", params.embedding}, - }); - res.result_json = json{ - {"embedding", std::vector(n_embd, 0.0f)}, - }; - } else { - const float *data = llama_get_embeddings(ctx); - std::vector embedding(data, data + n_embd); - res.result_json = json{ - {"embedding", embedding}, - }; + if (queue_results.empty()) + { + continue; + } + + for (int i = 0; i < (int) 
queue_results.size(); i++) + { + if (queue_results[i].id == task_id) + { + task_result res = queue_results[i]; + queue_results.erase(queue_results.begin() + i); + return res; + } + } + } + + // never reached + //return task_result{-1, false, false, {}}; } - queue_results.push_back(res); - } - int request_completion(json data, bool infill) { - std::lock_guard lock(mutex_tasks); - task_server task; - task.id = id_gen++; - task.data = data; - task.infill_mode = infill; - task.type = COMPLETION_TASK; - queue_tasks.push_back(task); - return task.id; - } + // for multiple images processing + bool ingest_images(llama_client_slot &slot, int n_batch) + { + int image_idx = 0; + + while (image_idx < (int) slot.images.size()) + { + slot_image &img = slot.images[image_idx]; + + // process prefix prompt + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) + { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + if (llama_decode(ctx, batch_view)) + { + LOG_TEE("%s : failed to eval\n", __func__); + return false; + } + } - task_result next_result(int task_id) { - while (true) { - std::this_thread::sleep_for(std::chrono::microseconds(5)); - std::lock_guard lock(mutex_results); + // process image with llm + for (int i = 0; i < img.image_tokens; i += n_batch) + { + int n_eval = img.image_tokens - i; + if (n_eval > n_batch) + { + n_eval = n_batch; + } - if (queue_results.empty()) { - continue; - } + const int n_embd = llama_n_embd(model); + llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, }; + if (llama_decode(ctx, batch_img)) + { + LOG_TEE("%s : failed to eval image\n", __func__); + return false; + } + slot.n_past += n_eval; + } + image_idx++; - for (int i = 0; i < (int)queue_results.size(); i++) { - if (queue_results[i].id == task_id) { - task_result res = queue_results[i]; - queue_results.erase(queue_results.begin() + i); - return res; + llama_batch_clear(batch); + + // append prefix of next image + const auto json_prompt = (image_idx >= (int) slot.images.size()) ? 
+ slot.params.input_suffix : // no more images, then process suffix prompt + (json)(slot.images[image_idx].prefix_prompt); + + std::vector append_tokens = tokenize(json_prompt, false); // has next image + for (int i = 0; i < (int) append_tokens.size(); ++i) + { + llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true); + slot.n_past += 1; + } } - } + + return true; } - // never reached - // return task_result{-1, false, false, {}}; - } + void request_cancel(int task_id) + { + std::lock_guard lock(mutex_tasks); + task_server task; + task.id = id_gen++; + task.type = CANCEL_TASK; + task.target_id = task_id; + queue_tasks.push_back(task); + } - // for multiple images processing - bool ingest_images(llama_client_slot &slot, int n_batch) { - int image_idx = 0; - - while (image_idx < (int)slot.images.size()) { - slot_image &img = slot.images[image_idx]; - - // process prefix prompt - for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = - std::min(n_batch, (int32_t)(batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, - 0, - 0, // unused - }; - if (llama_decode(ctx, batch_view)) { - LOG_TEE("%s : failed to eval\n", __func__); - return false; + void process_tasks() + { + std::lock_guard lock(mutex_tasks); + while (!queue_tasks.empty()) + { + task_server task = queue_tasks.front(); + queue_tasks.erase(queue_tasks.begin()); + switch (task.type) + { + case COMPLETION_TASK: { + llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); + if (slot == nullptr) + { + LOG_TEE("slot unavailable\n"); + // send error result + send_error(task.id, "slot unavailable"); + return; + } + + if (task.data.contains("system_prompt")) + { + process_system_prompt_data(task.data["system_prompt"]); + } + + slot->reset(); + + slot->infill = task.infill_mode; + slot->embedding = task.embedding_mode; + slot->task_id = task.id; + + if (!launch_slot_with_data(slot, task.data)) + { + // send error result + send_error(task.id, "internal_error"); + break; + } + } break; + case CANCEL_TASK: { // release slot linked with the task id + for (auto & slot : slots) + { + if (slot.task_id == task.target_id) + { + slot.release(); + break; + } + } + } break; + } } - } + } - // process image with llm - for (int i = 0; i < img.image_tokens; i += n_batch) { - int n_eval = img.image_tokens - i; - if (n_eval > n_batch) { - n_eval = n_batch; + bool update_slots() { + // attend tasks + process_tasks(); + + // update the system prompt wait until all slots are idle state + if (system_need_update && all_slots_are_idle) + { + LOG_TEE("updating system prompt\n"); + update_system_prompt(); } - const int n_embd = llama_n_embd(model); - llama_batch batch_img = { - n_eval, nullptr, (img.image_embedding + i * n_embd), - nullptr, nullptr, nullptr, - nullptr, slot.n_past, 1, - 0, - }; - if (llama_decode(ctx, batch_img)) { - LOG_TEE("%s : failed to eval image\n", __func__); - return false; - } - slot.n_past += n_eval; - } - image_idx++; - - llama_batch_clear(batch); - - // append prefix of next image - const auto json_prompt = - (image_idx >= (int)slot.images.size()) - ? 
slot.params.input_suffix - : // no more images, then process suffix prompt - (json)(slot.images[image_idx].prefix_prompt); - - std::vector append_tokens = - tokenize(json_prompt, false); // has next image - for (int i = 0; i < (int)append_tokens.size(); ++i) { - llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true); - slot.n_past += 1; - } - } + llama_batch_clear(batch); - return true; - } + if (all_slots_are_idle) + { + if (system_prompt.empty() && clean_kv_cache) + { + LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); + kv_cache_clear(); + } + // avoid 100% usage of cpu all time + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + } - void request_cancel(int task_id) { - std::lock_guard lock(mutex_tasks); - task_server task; - task.id = id_gen++; - task.type = CANCEL_TASK; - task.target_id = task_id; - queue_tasks.push_back(task); - } + for (llama_client_slot &slot : slots) + { + if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx) + { + // Shift context + const int n_left = slot.n_past - slot.params.n_keep - 1; + const int n_discard = n_left / 2; + + LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard); + + for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) + { + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + } - void process_tasks() { - std::lock_guard lock(mutex_tasks); - while (!queue_tasks.empty()) { - // LOG_INFO << "test tasks"; - task_server task = queue_tasks.front(); - queue_tasks.erase(queue_tasks.begin()); - switch (task.type) { - case COMPLETION_TASK: { - llama_client_slot *slot = - get_slot(json_value(task.data, "slot_id", -1)); - if (slot == nullptr) { - LOG_TEE("slot unavailable\n"); - // send error result - send_error(task.id, "slot unavaliable"); - return; - } - - if (task.data.contains("system_prompt")) { - process_system_prompt_data(task.data["system_prompt"]); - } - - slot->reset(); - - slot->infill = task.infill_mode; - slot->task_id = task.id; - - if (!launch_slot_with_data(slot, task.data)) { - // send error result - send_error(task.id, "internal_error"); - break; - } - } break; - case CANCEL_TASK: { // release slot linked with the task id - for (auto &slot : slots) { - if (slot.task_id == task.target_id) { - slot.release(); - slot.print_timings(); - break; - } + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + + slot.n_past -= n_discard; + + slot.truncated = true; + + LOG_VERBOSE("context shift", { + {"n_ctx", n_ctx}, + {"n_keep", params.n_keep}, + {"n_left", n_left}, + }); + } } - } break; - } - } - } - bool update_slots() { - // attend tasks - process_tasks(); + // decode any currently ongoing sequences + for (auto & slot : slots) + { + // release the slot + if (slot.command == RELEASE) + { + slot.state = IDLE; + slot.command = NONE; + slot.t_last_used = ggml_time_us(); - // update the system prompt wait until all slots are idle state - if (system_need_update && all_slots_are_idle) { - LOG_TEE("updating system prompt\n"); - update_system_prompt(); - } + LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); - llama_batch_clear(batch); + continue; + } - if (all_slots_are_idle) { - if (system_prompt.empty() && 
clean_kv_cache) { - LOG_TEE("all slots are idle and system prompt is empty, clear the KV " - "cache\n"); - kv_cache_clear(); - } - // avoid 100% usage of cpu all time - std::this_thread::sleep_for(std::chrono::milliseconds(5)); - } + if (slot.state == IDLE) + { + continue; + } - for (llama_client_slot &slot : slots) { - if (slot.is_processing() && - slot.cache_tokens.size() >= (size_t)slot.n_ctx) { - // Shift context - const int n_left = slot.n_past - slot.params.n_keep - 1; - const int n_discard = n_left / 2; + slot.i_batch = batch.n_tokens; - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard " - "= %d\n", - slot.id, slot.params.n_keep, n_left, n_discard); - llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, - slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, slot.id, - slot.params.n_keep + 1 + n_discard, - slot.n_past, -n_discard); + llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true); - for (size_t i = slot.params.n_keep + 1 + n_discard; - i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + slot.n_decoded += 1; + slot.n_past += 1; } - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + // process in chunks of params.n_batch + int32_t n_batch = params.n_batch; - slot.n_past -= n_discard; + // assign workload to the slots + if (params.cont_batching || batch.n_tokens == 0) + { + for (auto & slot : slots) + { + const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()) || !slot.images.empty(); + + // empty prompt passed -> release the slot and send empty response + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + continue; + } - slot.truncated = true; + // need process the prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT) + { + slot.state = PROCESSING; + slot.command = NONE; + std::vector prompt_tokens; + slot.t_start_process_prompt = ggml_time_us(); + slot.t_start_genereration = 0; + + if (slot.infill) + { + bool suff_rm_leading_spc = true; + if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) + { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } + auto prefix_tokens = tokenize(slot.params.input_prefix, false); + auto suffix_tokens = tokenize(slot.params.input_suffix, false); + + const int space_token = 29871; // TODO: this should not be hardcoded + if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { + suffix_tokens.erase(suffix_tokens.begin()); + } + + prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); + prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS + prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); + prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); + prefix_tokens.push_back(llama_token_middle(model)); + prompt_tokens = prefix_tokens; + } + else + { + prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt + } + + slot.num_prompt_tokens = prompt_tokens.size(); + + if (!slot.params.cache_prompt) + { + llama_sampling_reset(slot.ctx_sampling); + + slot.n_past = 0; + slot.num_prompt_tokens_processed = slot.num_prompt_tokens; + } + else + { + if (slot.params.n_keep < 0) + { + slot.params.n_keep = slot.num_prompt_tokens; + } + 
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - LOG_VERBOSE("context shift", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - }); - } - } + // if input prompt is too big, truncate it + if (slot.num_prompt_tokens >= slot.n_ctx) + { + const int n_left = slot.n_ctx - slot.params.n_keep; + const int n_block_size = n_left / 2; + const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; + + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep); + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); + + LOG_VERBOSE("input truncated", { + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, + }); + slot.truncated = true; + prompt_tokens = new_tokens; + + slot.num_prompt_tokens = prompt_tokens.size(); + GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); + } + + // push the prompt into the sampling context (do not apply grammar) + for (auto &token : prompt_tokens) + { + llama_sampling_accept(slot.ctx_sampling, ctx, token, false); + } - // decode any currently ongoing sequences - for (auto &slot : slots) { - // release the slot - if (slot.command == RELEASE) { - slot.state = IDLE; - slot.command = NONE; - slot.t_last_used = ggml_time_us(); + slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; - LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, - (int)slot.cache_tokens.size()); + LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); + } - continue; - } + LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); - if (slot.state == IDLE) { - continue; - } + llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); - slot.i_batch = batch.n_tokens; + slot.cache_tokens = prompt_tokens; - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, - {slot.id}, true); + if (slot.n_past == slot.num_prompt_tokens) + { + // we have to evaluate at least 1 token to generate logits. 
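+                    // note: logits are only produced for tokens that are actually submitted to
+                    // llama_decode, so when the entire prompt is already cached we step n_past
+                    // back by one and re-evaluate the final prompt token to obtain logits for
+                    // sampling the first generated token.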
+ LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); + slot.n_past--; + } - slot.n_decoded += 1; - slot.n_past += 1; - } + LOG_VERBOSE("prompt ingested", { + {"n_past", slot.n_past}, + {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, + {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())}, + }); - // process in chunks of params.n_batch - int32_t n_batch = params.n_batch; - - // assign workload to the slots - if (params.cont_batching || batch.n_tokens == 0) { - for (auto &slot : slots) { - const bool has_prompt = slot.prompt.is_array() || - (slot.prompt.is_string() && - !slot.prompt.get().empty()) || - !slot.images.empty(); - - // empty prompt passed -> release the slot and send empty response - if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - continue; - } - - // need process the prompt - if (slot.state == IDLE && slot.command == LOAD_PROMPT) { - slot.state = PROCESSING; - slot.command = NONE; - std::vector prompt_tokens; - slot.t_start_process_prompt = ggml_time_us(); - slot.t_start_genereration = 0; - - if (slot.infill) { - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(' ') == 0 && - params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; - } - auto prefix_tokens = tokenize(slot.params.input_prefix, false); - auto suffix_tokens = tokenize(slot.params.input_suffix, false); - - const int space_token = 29871; // TODO: this should not be hardcoded - if (suff_rm_leading_spc && !suffix_tokens.empty() && - suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } - - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), - llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), - suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(model)); - prompt_tokens = prefix_tokens; - } else { - prompt_tokens = tokenize( - slot.prompt, - system_prompt.empty()); // add BOS if there isn't system prompt - } - - slot.num_prompt_tokens = prompt_tokens.size(); - - if (!slot.params.cache_prompt) { - llama_sampling_reset(slot.ctx_sampling); - - slot.n_past = 0; - slot.num_prompt_tokens_processed = slot.num_prompt_tokens; - } else { - if (slot.params.n_keep < 0) { - slot.params.n_keep = slot.num_prompt_tokens; - } - slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - - // if input prompt is too big, truncate it - if (slot.num_prompt_tokens >= slot.n_ctx) { - const int n_left = slot.n_ctx - slot.params.n_keep; - const int n_block_size = n_left / 2; - const int erased_blocks = - (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / - n_block_size; - - std::vector new_tokens(prompt_tokens.begin(), - prompt_tokens.begin() + - slot.params.n_keep); - new_tokens.insert(new_tokens.end(), - prompt_tokens.begin() + slot.params.n_keep + - erased_blocks * n_block_size, - prompt_tokens.end()); - - LOG_VERBOSE( - "input truncated", - { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), - new_tokens.cend())}, - }); - slot.truncated = true; - prompt_tokens = new_tokens; - - 
slot.num_prompt_tokens = prompt_tokens.size(); - GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); - } - - // push the prompt into the sampling context (do not apply grammar) - for (auto &token : prompt_tokens) { - llama_sampling_accept(slot.ctx_sampling, ctx, token, false); - } - - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); - slot.num_prompt_tokens_processed = - slot.num_prompt_tokens - slot.n_past; - - LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", - slot.id, slot.n_past, slot.num_prompt_tokens_processed); - } - - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, - (int)system_tokens.size() + slot.n_past); - - llama_kv_cache_seq_rm(ctx, slot.id, - system_tokens.size() + slot.n_past, -1); - - slot.cache_tokens = prompt_tokens; - - if (slot.n_past == slot.num_prompt_tokens) { - // we have to evaluate at least 1 token to generate logits. - LOG_TEE("slot %d : we have to evaluate at least 1 token to " - "generate logits\n", - slot.id); - slot.n_past--; - } - - LOG_VERBOSE( - "prompt ingested", - { - {"n_past", slot.n_past}, - {"cached", - tokens_to_str(ctx, slot.cache_tokens.cbegin(), - slot.cache_tokens.cbegin() + slot.n_past)}, - {"to_eval", - tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, - slot.cache_tokens.cend())}, - }); - - const bool has_images = process_images(slot); - - // process the prefix of first image - std::vector prefix_tokens = - has_images ? tokenize(slot.images[0].prefix_prompt, true) - : prompt_tokens; - for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) { - llama_batch_add(batch, prefix_tokens[slot.n_past], - system_tokens.size() + slot.n_past, {slot.id}, - false); - } - - if (has_images && !ingest_images(slot, n_batch)) { - LOG_TEE("failed processing images\n"); - return false; - } + const bool has_images = process_images(slot); - // extract the logits only for the last token - if (batch.n_tokens > 0) { - batch.logits[batch.n_tokens - 1] = true; - } + // process the prefix of first image + std::vector prefix_tokens = has_images ? 
tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens; + for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) + { + llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false); + } - slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; - } - } - } + if (has_images && !ingest_images(slot, n_batch)) + { + LOG_TEE("failed processing images\n"); + return false; + } - if (batch.n_tokens == 0) { - all_slots_are_idle = true; - return true; - } + // extract the logits only for the last token + if (batch.n_tokens > 0) + { + batch.logits[batch.n_tokens - 1] = true; + } - for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t)(batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, - 0, - 0, // unused - }; + slot.n_decoded = 0; + slot.i_batch = batch.n_tokens - 1; + } + } + } - const int ret = llama_decode(ctx, batch_view); - if (ret != 0) { - if (n_batch == 1 || ret < 0) { - // if you get here, it means the KV cache is full - try increasing it - // via the context size - LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", - __func__, n_batch, ret); - return false; + if (batch.n_tokens == 0) + { + all_slots_are_idle = true; + return true; } - LOG_TEE("%s : failed to find free space in the KV cache, retrying with " - "smaller n_batch = %d\n", - __func__, n_batch / 2); + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) + { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = + { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + + const int ret = llama_decode(ctx, batch_view); + if (ret != 0) + { + if (n_batch == 1 || ret < 0) + { + // if you get here, it means the KV cache is full - try increasing it via the context size + LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + return false; + } - // retry with half the batch size to try to find a free slot in the KV - // cache - n_batch /= 2; - i -= n_batch; - continue; - } + LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2); - for (auto &slot : slots) { - if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) { - continue; - } + // retry with half the batch size to try to find a free slot in the KV cache + n_batch /= 2; + i -= n_batch; + continue; + } - // prompt evaluated for embedding - if (params.embedding) { - send_embedding(slot); - slot.release(); - slot.i_batch = -1; - return true; - } + for (auto & slot : slots) + { + if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) + { + continue; + } - completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, - NULL, slot.i_batch - i); + // prompt evaluated for embedding + if (slot.embedding) + { + send_embedding(slot); + slot.release(); + slot.i_batch = -1; + return true; + } - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); + completion_token_output result; + const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i); - if (slot.n_decoded == 1) { - slot.t_start_genereration = ggml_time_us(); - slot.t_prompt_processing = - (slot.t_start_genereration - 
slot.t_start_process_prompt) / 1e3; - } + llama_sampling_accept(slot.ctx_sampling, ctx, id, true); - llama_token_data_array cur_p = {slot.ctx_sampling->cur.data(), - slot.ctx_sampling->cur.size(), false}; - result.tok = id; + if (slot.n_decoded == 1) + { + slot.t_start_genereration = ggml_time_us(); + slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; + } - const int32_t n_probs = slot.sparams.n_probs; - if (slot.sparams.temp <= 0 && n_probs > 0) { - // for llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &cur_p); - } + llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; + result.tok = id; - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); - } + const int32_t n_probs = slot.sparams.n_probs; + if (slot.sparams.temp <= 0 && n_probs > 0) + { + // for llama_sample_token_greedy we need to sort candidates + llama_sample_softmax(ctx, &cur_p); + } - if (!process_token(result, slot)) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - } + for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) + { + result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); + } + + if (!process_token(result, slot)) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + } - slot.i_batch = -1; - } + slot.i_batch = -1; + } + } + return true; } - return true; - } }; static void server_print_usage(const char *argv0, const gpt_params ¶ms, - const server_params &sparams) { - printf("usage: %s [options]\n", argv0); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", - server_verbose ? "enabled" : "disabled"); - printf(" -t N, --threads N number of threads to use during " - "computation (default: %d)\n", - params.n_threads); - printf(" -tb N, --threads-batch N number of threads to use during batch " - "and prompt processing (default: same as --threads)\n"); - printf( - " -c N, --ctx-size N size of the prompt context (default: %d)\n", - params.n_ctx); - printf(" --rope-freq-base N RoPE base frequency (default: loaded " - "from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor (default: " - "loaded from model)\n"); - printf(" -b N, --batch-size N batch size for prompt processing " - "(default: %d)\n", - params.n_batch); - printf(" --memory-f32 use f32 instead of f16 for memory " - "key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory " - "required and no measurable increase in quality\n"); - if (llama_mlock_supported()) { - printf(" --mlock force system to keep model in RAM rather " - "than swapping or compressing\n"); - } - if (llama_mmap_supported()) { - printf(" --no-mmap do not memory-map model (slower load but " - "may reduce pageouts if not using mlock)\n"); - } - printf(" --numa attempt optimizations that help on some NUMA " - "systems\n"); + const server_params &sparams) +{ + printf("usage: %s [options]\n", argv0); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? 
"enabled" : "disabled"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); + printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + if (llama_mlock_supported()) + { + printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + if (llama_mmap_supported()) + { + printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } + printf(" --numa attempt optimizations that help on some NUMA systems\n"); #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, " - "comma-separated list of proportions, e.g. 3,1\n"); - printf( - " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use cuBLAS instead of custom mul_mat_q CUDA " - "kernels.\n"); - printf(" Not recommended since this is both slower " - "and uses more VRAM.\n"); + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ts SPLIT --tensor-split SPLIT\n"); + printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + printf(" -nommq, --no-mul-mat-q\n"); + printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); + printf(" Not recommended since this is both slower and uses more VRAM.\n"); #endif - printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: %s)\n", - params.model.c_str()); - printf(" -a ALIAS, --alias ALIAS\n"); - printf(" set an alias for the model, will be added as " - "`model` field in completion response\n"); - printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf(" --lora-base FNAME optional model to use as a base for the " - "layers modified by the LoRA adapter\n"); - printf( - " --host ip address to listen (default (default: %s)\n", - sparams.hostname.c_str()); - printf(" --port PORT port to listen (default (default: %d)\n", - sparams.port); - printf(" --path PUBLIC_PATH path from which to serve static files " - "(default %s)\n", - sparams.public_path.c_str()); - printf(" -to N, --timeout N server read/write timeout in seconds " - "(default: %d)\n", - sparams.read_timeout); - printf( - " --embedding enable embedding vector output (default: %s)\n", - params.embedding ? "enabled" : "disabled"); - printf(" -np N, --parallel N number of slots for process requests " - "(default: %d)\n", - params.n_parallel); - printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic " - "batching) (default: disabled)\n"); - printf(" -spf FNAME, --system-prompt-file FNAME\n"); - printf(" Set a file to load a system prompt (initial " - "prompt of all slots), this is useful for chat applications.\n"); - printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for " - "LLaVA.\n"); - printf("\n"); + printf(" -m FNAME, --model FNAME\n"); + printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -a ALIAS, --alias ALIAS\n"); + printf(" set an alias for the model, will be added as `model` field in completion response\n"); + printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); + printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); + printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); + printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? 
"enabled" : "disabled"); + printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); + printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); + printf(" -spf FNAME, --system-prompt-file FNAME\n"); + printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); + printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); + printf("\n"); } static void server_params_parse(int argc, char **argv, server_params &sparams, - gpt_params ¶ms, - llama_server_context &llama) { - gpt_params default_params; - server_params default_sparams; - std::string arg; - bool invalid_param = false; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg == "--port") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.port = std::stoi(argv[i]); - } else if (arg == "--host") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.hostname = argv[i]; - } else if (arg == "--path") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.public_path = argv[i]; - } else if (arg == "--timeout" || arg == "-to") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.read_timeout = std::stoi(argv[i]); - sparams.write_timeout = std::stoi(argv[i]); - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model = argv[i]; - } else if (arg == "-a" || arg == "--alias") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_alias = argv[i]; - } else if (arg == "-h" || arg == "--help") { - server_print_usage(argv[0], default_params, default_sparams); - exit(0); - } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--rope-freq-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_base = std::stof(argv[i]); - } else if (arg == "--rope-freq-scale") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = std::stof(argv[i]); - } else if (arg == "--memory-f32" || arg == "--memory_f32") { - params.memory_f16 = false; - } else if (arg == "--threads" || arg == "-t") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads = std::stoi(argv[i]); - } else if (arg == "--threads-batch" || arg == "-tb") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_batch = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_batch = std::stoi(argv[i]); - params.n_batch = std::min(512, params.n_batch); - } else if (arg == "--gpu-layers" || arg == "-ngl" || - arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - break; - } + gpt_params ¶ms, llama_server_context& llama) +{ + gpt_params default_params; + server_params default_sparams; + std::string arg; + bool invalid_param = false; + + for (int i = 1; i < argc; i++) + { + arg = argv[i]; + if (arg == "--port") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.port = std::stoi(argv[i]); + } + else if (arg == "--host") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.hostname = argv[i]; + } + else if (arg == "--path") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + 
sparams.public_path = argv[i]; + } + else if (arg == "--timeout" || arg == "-to") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.read_timeout = std::stoi(argv[i]); + sparams.write_timeout = std::stoi(argv[i]); + } + else if (arg == "-m" || arg == "--model") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.model = argv[i]; + } + else if (arg == "-a" || arg == "--alias") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.model_alias = argv[i]; + } + else if (arg == "-h" || arg == "--help") + { + server_print_usage(argv[0], default_params, default_sparams); + exit(0); + } + else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } + else if (arg == "--rope-scaling") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } + } + else if (arg == "--rope-freq-base") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.rope_freq_base = std::stof(argv[i]); + } + else if (arg == "--rope-freq-scale") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.rope_freq_scale = std::stof(argv[i]); + } + else if (arg == "--yarn-ext-factor") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_ext_factor = std::stof(argv[i]); + } + else if (arg == "--yarn-attn-factor") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_attn_factor = std::stof(argv[i]); + } + else if (arg == "--yarn-beta-fast") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_fast = std::stof(argv[i]); + } + else if (arg == "--yarn-beta-slow") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_slow = std::stof(argv[i]); + } + else if (arg == "--memory-f32" || arg == "--memory_f32") + { + params.memory_f16 = false; + } + else if (arg == "--threads" || arg == "-t") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } + else if (arg == "--threads-batch" || arg == "-tb") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_threads_batch = std::stoi(argv[i]); + } + else if (arg == "-b" || arg == "--batch-size") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } + else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") + { + if (++i >= argc) + { + invalid_param = true; + break; + } #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - params.n_gpu_layers = std::stoi(argv[i]); + params.n_gpu_layers = std::stoi(argv[i]); #else - LOG_WARNING_LLAMA( - "Not compiled with GPU offload support, --n-gpu-layers option will " - "be ignored. " - "See main README.md for information on enabling GPU BLAS support", - {{"n_gpu_layers", params.n_gpu_layers}}); + LOG_WARNING_LLAMA("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. 
" + "See main README.md for information on enabling GPU BLAS support", + {{"n_gpu_layers", params.n_gpu_layers}}); #endif - } else if (arg == "--tensor-split" || arg == "-ts") { - if (++i >= argc) { - invalid_param = true; - break; - } + } + else if (arg == "--tensor-split" || arg == "-ts") + { + if (++i >= argc) + { + invalid_param = true; + break; + } #ifdef GGML_USE_CUBLAS - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, - -1}; - std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - - for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) { - if (i_device < split_arg.size()) { - params.tensor_split[i_device] = std::stof(split_arg[i_device]); - } else { - params.tensor_split[i_device] = 0.0f; - } - } + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + + for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) + { + if (i_device < split_arg.size()) + { + params.tensor_split[i_device] = std::stof(split_arg[i_device]); + } + else + { + params.tensor_split[i_device] = 0.0f; + } + } #else - LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not " - "possible to set a tensor split.\n", - {}); + LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {}); #endif // GGML_USE_CUBLAS - } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { + } + else if (arg == "--no-mul-mat-q" || arg == "-nommq") + { #ifdef GGML_USE_CUBLAS - params.mul_mat_q = false; + params.mul_mat_q = false; #else - LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. " - "Disabling mul_mat_q kernels has no effect.\n", - {}); + LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {}); #endif // GGML_USE_CUBLAS - } else if (arg == "--main-gpu" || arg == "-mg") { - if (++i >= argc) { - invalid_param = true; - break; - } + } + else if (arg == "--main-gpu" || arg == "-mg") + { + if (++i >= argc) + { + invalid_param = true; + break; + } #ifdef GGML_USE_CUBLAS - params.main_gpu = std::stoi(argv[i]); + params.main_gpu = std::stoi(argv[i]); #else - LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not " - "possible to set a main GPU.", - {}); + LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. 
It is not possible to set a main GPU.", {}); #endif - } else if (arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); - params.use_mmap = false; - } else if (arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - break; - } - const char *lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.push_back( - std::make_tuple(lora_adapter, std::stof(argv[i]))); - params.use_mmap = false; - } else if (arg == "--lora-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } else if (arg == "-v" || arg == "--verbose") { + } + else if (arg == "--lora") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); + params.use_mmap = false; + } + else if (arg == "--lora-scaled") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + const char * lora_adapter = argv[i]; + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i]))); + params.use_mmap = false; + } + else if (arg == "--lora-base") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_base = argv[i]; + } + else if (arg == "-v" || arg == "--verbose") + { #if SERVER_VERBOSE != 1 - LOG_WARNING_LLAMA("server.cpp is not built with verbose logging.", {}); + LOG_WARNING_LLAMA("server.cpp is not built with verbose logging.", {}); #else - server_verbose = true; + server_verbose = true; #endif - } else if (arg == "--mlock") { - params.use_mlock = true; - } else if (arg == "--no-mmap") { - params.use_mmap = false; - } else if (arg == "--numa") { - params.numa = true; - } else if (arg == "--embedding") { - params.embedding = true; - } else if (arg == "-cb" || arg == "--cont-batching") { - params.cont_batching = true; - } else if (arg == "-np" || arg == "--parallel") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--n-predict") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_predict = std::stoi(argv[i]); - } else if (arg == "-spf" || arg == "--system-prompt-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::string systm_content; - std::copy(std::istreambuf_iterator(file), + } + else if (arg == "--mlock") + { + params.use_mlock = true; + } + else if (arg == "--no-mmap") + { + params.use_mmap = false; + } + else if (arg == "--numa") + { + params.numa = true; + } + else if (arg == "--embedding") + { + params.embedding = true; + } + else if (arg == "-cb" || arg == "--cont-batching") + { + params.cont_batching = true; + } + else if (arg == "-np" || arg == "--parallel") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_parallel = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--n-predict") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + } else if (arg == "-spf" || arg == "--system-prompt-file") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + 
std::string systm_content; + std::copy( + std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(systm_content)); - llama.process_system_prompt_data(json::parse(systm_content)); - } else if (arg == "--mmproj") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.mmproj = argv[i]; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - server_print_usage(argv[0], default_params, default_sparams); - exit(1); + std::back_inserter(systm_content) + ); + llama.process_system_prompt_data(json::parse(systm_content)); + } + else if(arg == "--mmproj") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.mmproj = argv[i]; + } + else + { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + server_print_usage(argv[0], default_params, default_sparams); + exit(1); + } } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - server_print_usage(argv[0], default_params, default_sparams); - exit(1); - } + if (invalid_param) + { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + server_print_usage(argv[0], default_params, default_sparams); + exit(1); + } } -static json -format_partial_response(llama_server_context &llama, llama_client_slot *slot, - const std::string &content, - const std::vector &probs) { - json res = json{{"content", content}, - {"stop", false}, - {"slot_id", slot->id}, - {"multimodal", llama.multimodal}}; - - if (slot->sparams.n_probs > 0) { - res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); - } +static json format_partial_response( + llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector &probs +) { + json res = json + { + {"content", content }, + {"stop", false}, + {"slot_id", slot->id }, + {"multimodal", llama.multimodal } + }; - return res; + if (slot->sparams.n_probs > 0) + { + res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); + } + + return res; } -static json format_tokenizer_response(const std::vector &tokens) { - return json{{"tokens", tokens}}; +static json format_tokenizer_response(const std::vector &tokens) +{ + return json{ + {"tokens", tokens}}; } -static json format_detokenized_response(std::string content) { - return json{{"content", content}}; +static json format_detokenized_response(std::string content) +{ + return json{ + {"content", content}}; } -struct token_translator { - llama_context *ctx; - std::string operator()(llama_token tok) const { - return llama_token_to_piece(ctx, tok); - } - std::string operator()(const completion_token_output &cto) const { - return (*this)(cto.tok); - } +struct token_translator +{ + llama_context * ctx; + std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } + std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } }; -static void -append_to_generated_text_from_generated_token_probs(llama_server_context &llama, - llama_client_slot *slot) { - auto >ps = slot->generated_token_probs; - auto translator = token_translator{llama.ctx}; - auto add_strlen = [=](size_t sum, const completion_token_output &cto) { - return sum + translator(cto).size(); - }; - const size_t len = - std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); - if (slot->generated_text.capacity() < slot->generated_text.size() + len) { - slot->generated_text.reserve(slot->generated_text.size() + len); - } - for (const 
completion_token_output &cto : gtps) { - slot->generated_text += translator(cto); - } +static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot) +{ + auto & gtps = slot->generated_token_probs; + auto translator = token_translator{llama.ctx}; + auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); }; + const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); + if (slot->generated_text.capacity() < slot->generated_text.size() + len) + { + slot->generated_text.reserve(slot->generated_text.size() + len); + } + for (const completion_token_output & cto : gtps) + { + slot->generated_text += translator(cto); + } } + using namespace drogon; namespace inferences {
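// Illustrative sketch of how a caller (e.g. the HTTP controller) is expected to drive the
// updated queue API: request_completion() now takes an explicit embedding flag next to infill,
// and next_result() blocks until the matching task_result is available. The helper name and
// the prompt/n_predict values below are hypothetical and shown for illustration only.
inline std::string example_completion_roundtrip(llama_server_context &llama) {
  const int task_id = llama.request_completion(
      {{"prompt", "Hello"}, {"n_predict", 16}}, // request body as nlohmann::json
      /*infill=*/false,
      /*embedding=*/false); // true routes the slot to send_embedding() instead
  task_result result = llama.next_result(task_id); // polls queue_results for this task_id
  return result.result_json.dump();                // serialized JSON response
}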