From 44eeb6a88e480ec10dda58f79c82f9bb74d63b0a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 21:03:24 +0200 Subject: [PATCH 01/12] server : add "tokens" output ggml-ci --- examples/server/server.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 40aac33f0bf13..5b6d660b848d2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -704,6 +704,7 @@ struct server_task_result_cmpl_partial : server_task_result { {"delta", json { {"content", content}, + {"tokens", tokens} }}, }}); } @@ -1003,6 +1004,7 @@ struct server_slot { n_prompt_tokens = 0; last_nl_pos = 0; generated_text = ""; + generated_tokens = {}; has_new_line = false; truncated = false; stop = STOP_TYPE_NONE; From 07946a3a309c3b4f62053e7cee5ef282feb66b4f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 10:56:20 +0200 Subject: [PATCH 02/12] server : output embeddings for all tokens when pooling = none ggml-ci --- examples/server/server.cpp | 19 +++++++++++++++---- examples/server/tests/unit/test_embedding.py | 12 ++++++++++++ examples/server/tests/utils.py | 6 +++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5b6d660b848d2..eac9ada5f523f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -727,7 +727,7 @@ struct server_task_result_cmpl_partial : server_task_result { struct server_task_result_embd : server_task_result { int index = 0; - std::vector embedding; + std::vector> embedding; int32_t n_tokens; @@ -736,6 +736,14 @@ struct server_task_result_embd : server_task_result { } virtual json to_json() override { + if (embedding.size() == 1){ + // to be OAI compatible + return json { + {"index", index}, + {"embedding", embedding[0]}, + }; + } + return json { {"index", index}, {"embedding", embedding}, @@ -2040,12 +2048,12 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, 
"failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res->embedding = std::vector(n_embd, 0.0f); + res->embedding.push_back(std::vector(n_embd, 0.0f)); continue; } common_embd_normalize(embd, embd_res.data(), n_embd); - res->embedding = embd_res; + res->embedding.push_back(embd_res); } SLT_DBG(slot, "%s", "sending embeddings\n"); @@ -2659,7 +2667,10 @@ struct server_context { // add prompt tokens for processing in the current batch while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { - common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false); + // without pooling, we want to output the embeddings for all the tokens in the batch + const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; + + common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py index 4f4e9dcf087fa..d6a3b612531b2 100644 --- a/examples/server/tests/unit/test_embedding.py +++ b/examples/server/tests/unit/test_embedding.py @@ -74,6 +74,18 @@ def test_embedding_mixed_input(content, is_multi_prompt: bool): assert len(res.body['embedding']) > 1 +def test_embedding_pooling_none(): + server = ServerPreset.bert_bge_small(pooling = 'none') + server.start() + res = server.make_request("POST", "/embeddings", data={ + "input": "hello hello hello", + }) + assert res.status_code == 200 + assert len(res.body['data']) == 1 + assert 'embedding' in res.body['data'][0] + assert len(res.body['data'][0]['embedding']) == 3 + + def test_embedding_openai_library_single(): global server server.start() diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index d988ccf5e3061..da95c830b036d 100644 --- 
a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -65,6 +65,7 @@ class ServerProcess: server_reranking: bool | None = False server_metrics: bool | None = False server_slots: bool | None = False + pooling: str | None = None draft: int | None = None api_key: str | None = None response_format: str | None = None @@ -132,6 +133,8 @@ def start(self, timeout_seconds: int = 10) -> None: server_args.append("--metrics") if self.server_slots: server_args.append("--slots") + if self.pooling: + server_args.extend(["--pooling", self.pooling]) if self.model_alias: server_args.extend(["--alias", self.model_alias]) if self.n_ctx: @@ -272,7 +275,7 @@ def tinyllama2() -> ServerProcess: return server @staticmethod - def bert_bge_small() -> ServerProcess: + def bert_bge_small(pooling = 'last') -> ServerProcess: server = ServerProcess() server.model_hf_repo = "ggml-org/models" server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf" @@ -283,6 +286,7 @@ def bert_bge_small() -> ServerProcess: server.n_slots = 2 server.seed = 42 server.server_embeddings = True + server.pooling = pooling return server @staticmethod From d424afac5fef1232aeeb9fc32c27ef9cb8ea6369 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 11:01:29 +0200 Subject: [PATCH 03/12] server : update readme [no ci] --- examples/server/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/server/README.md b/examples/server/README.md index ecd24c899fc86..0787bfc306cfb 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -795,6 +795,8 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r }' ``` +When `--pooling none` is used, the server will output an array of embeddings - one for each token in the input. 
+ ### GET `/slots`: Returns the current slots processing state > [!WARNING] From 2dea48758e3935bbc20255619888ec220a969765 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 11:37:08 +0200 Subject: [PATCH 04/12] server : fix spacing [no ci] Co-authored-by: Xuan Son Nguyen --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index eac9ada5f523f..2327d9ee74b47 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -736,7 +736,7 @@ struct server_task_result_embd : server_task_result { } virtual json to_json() override { - if (embedding.size() == 1){ + if (embedding.size() == 1) { // to be OAI compatible return json { {"index", index}, From 2a94c330284b6bf2c4629199ea9d26be6c3c95fc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 11:45:18 +0200 Subject: [PATCH 05/12] server : be explicit about the pooling type in the tests ggml-ci --- examples/server/tests/unit/test_embedding.py | 9 ++++++++- examples/server/tests/utils.py | 3 +-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py index d6a3b612531b2..f60034b4ab5c1 100644 --- a/examples/server/tests/unit/test_embedding.py +++ b/examples/server/tests/unit/test_embedding.py @@ -14,6 +14,7 @@ def create_server(): def test_embedding_single(): global server + server.pooling = 'last' server.start() res = server.make_request("POST", "/embeddings", data={ "input": "I believe the meaning of life is", @@ -29,6 +30,7 @@ def test_embedding_single(): def test_embedding_multiple(): global server + server.pooling = 'last' server.start() res = server.make_request("POST", "/embeddings", data={ "input": [ @@ -75,7 +77,8 @@ def test_embedding_mixed_input(content, is_multi_prompt: bool): def test_embedding_pooling_none(): - server = ServerPreset.bert_bge_small(pooling = 'none') + 
global server + server.pooling = 'none' server.start() res = server.make_request("POST", "/embeddings", data={ "input": "hello hello hello", @@ -88,6 +91,7 @@ def test_embedding_pooling_none(): def test_embedding_openai_library_single(): global server + server.pooling = 'last' server.start() client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is") @@ -97,6 +101,7 @@ def test_embedding_openai_library_single(): def test_embedding_openai_library_multiple(): global server + server.pooling = 'last' server.start() client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") res = client.embeddings.create(model="text-embedding-3-small", input=[ @@ -112,6 +117,7 @@ def test_embedding_openai_library_multiple(): def test_embedding_error_prompt_too_long(): global server + server.pooling = 'last' server.start() res = server.make_request("POST", "/embeddings", data={ "input": "This is a test " * 512, @@ -121,6 +127,7 @@ def test_embedding_error_prompt_too_long(): def test_same_prompt_give_same_result(): + server.pooling = 'last' server.start() res = server.make_request("POST", "/embeddings", data={ "input": [ diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index da95c830b036d..277125e88b534 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -275,7 +275,7 @@ def tinyllama2() -> ServerProcess: return server @staticmethod - def bert_bge_small(pooling = 'last') -> ServerProcess: + def bert_bge_small() -> ServerProcess: server = ServerProcess() server.model_hf_repo = "ggml-org/models" server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf" @@ -286,7 +286,6 @@ def bert_bge_small(pooling = 'last') -> ServerProcess: server.n_slots = 2 server.seed = 42 server.server_embeddings = True - server.pooling = pooling return server @staticmethod From 
abf33e2017936a15ce1d1bf1c0a2956b16e28e1f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 15:59:55 +0200 Subject: [PATCH 06/12] server : update /embeddings and /v1/embeddings endpoints ggml-ci --- examples/server/server.cpp | 59 +++++++++++++------- examples/server/tests/unit/test_embedding.py | 28 +++++++--- 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2327d9ee74b47..bb0cd1f2b1239 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -731,25 +731,31 @@ struct server_task_result_embd : server_task_result { int32_t n_tokens; + // OAI-compat fields + bool oaicompat = false; + virtual int get_index() override { return index; } virtual json to_json() override { - if (embedding.size() == 1) { - // to be OAI compatible - return json { - {"index", index}, - {"embedding", embedding[0]}, - }; - } + return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); + } + json to_json_non_oaicompat() { return json { {"index", index}, {"embedding", embedding}, {"tokens_evaluated", n_tokens}, }; } + + json to_json_oaicompat() { + return json { + {"index", index}, + {"embedding", embedding[0]}, + }; + } }; struct server_task_result_rerank : server_task_result { @@ -2027,9 +2033,10 @@ struct server_context { void send_embedding(const server_slot & slot, const llama_batch & batch) { auto res = std::make_unique(); - res->id = slot.id_task; - res->index = slot.index; - res->n_tokens = slot.n_prompt_tokens; + res->id = slot.id_task; + res->index = slot.index; + res->n_tokens = slot.n_prompt_tokens; + res->oaicompat = slot.params.oaicompat; const int n_embd = llama_n_embd(model); @@ -3678,14 +3685,17 @@ int main(int argc, char ** argv) { res_ok(res, data); }; - const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const 
httplib::Request & req, httplib::Response & res, bool oaicompat) { const json body = json::parse(req.body); - bool oaicompat = false; + + if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); + return; + } // for the shape of input/content, see tokenize_input_prompts() json prompt; - if (body.contains("input")) { - oaicompat = true; + if (body.count("input") != 0) { prompt = body.at("input"); } else if (body.contains("content")) { oaicompat = false; @@ -3710,10 +3720,15 @@ int main(int argc, char ** argv) { { std::vector tasks; for (size_t i = 0; i < tokenized_prompts.size(); i++) { - server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; task.prompt_tokens = std::move(tokenized_prompts[i]); + + // OAI-compat + task.params.oaicompat = oaicompat;; + tasks.push_back(task); } @@ -3741,12 +3756,18 @@ int main(int argc, char ** argv) { } // write JSON response - json root = oaicompat - ? format_embeddings_response_oaicompat(body, responses) - : responses.size() == 1 ? responses[0] : json(responses); + json root = oaicompat ? 
format_embeddings_response_oaicompat(body, responses) : json(responses); res_ok(res, root); }; + const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { + handle_embeddings_impl(req, res, false); + }; + + const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { + handle_embeddings_impl(req, res, true); + }; + const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED)); @@ -3920,7 +3941,7 @@ int main(int argc, char ** argv) { svr->Post("/infill", handle_infill); svr->Post("/embedding", handle_embeddings); // legacy svr->Post("/embeddings", handle_embeddings); - svr->Post("/v1/embeddings", handle_embeddings); + svr->Post("/v1/embeddings", handle_embeddings_oai); svr->Post("/rerank", handle_rerank); svr->Post("/reranking", handle_rerank); svr->Post("/v1/rerank", handle_rerank); diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py index f60034b4ab5c1..32088d0a044d9 100644 --- a/examples/server/tests/unit/test_embedding.py +++ b/examples/server/tests/unit/test_embedding.py @@ -16,7 +16,7 @@ def test_embedding_single(): global server server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": "I believe the meaning of life is", }) assert res.status_code == 200 @@ -32,7 +32,7 @@ def test_embedding_multiple(): global server server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": [ "I believe the 
meaning of life is", "Write a joke about AI from a very long prompt which will not be truncated", @@ -84,16 +84,26 @@ def test_embedding_pooling_none(): "input": "hello hello hello", }) assert res.status_code == 200 - assert len(res.body['data']) == 1 - assert 'embedding' in res.body['data'][0] - assert len(res.body['data'][0]['embedding']) == 3 + assert 'embedding' in res.body[0] + assert len(res.body[0]['embedding']) == 3 + + +def test_embedding_pooling_none_oai(): + global server + server.pooling = 'none' + server.start() + res = server.make_request("POST", "/v1/embeddings", data={ + "input": "hello hello hello", + }) + # /v1/embeddings does not support pooling type 'none' + assert res.status_code == 400 def test_embedding_openai_library_single(): global server server.pooling = 'last' server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is") assert len(res.data) == 1 assert len(res.data[0].embedding) > 1 @@ -103,7 +113,7 @@ def test_embedding_openai_library_multiple(): global server server.pooling = 'last' server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") res = client.embeddings.create(model="text-embedding-3-small", input=[ "I believe the meaning of life is", "Write a joke about AI from a very long prompt which will not be truncated", @@ -119,7 +129,7 @@ def test_embedding_error_prompt_too_long(): global server server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": "This is a test " * 512, }) assert res.status_code != 200 @@ -129,7 +139,7 
@@ def test_embedding_error_prompt_too_long(): def test_same_prompt_give_same_result(): server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": [ "I believe the meaning of life is", "I believe the meaning of life is", From 7e693f92d7b59c31025fc4a866b713a768c9137b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 13:36:32 +0200 Subject: [PATCH 07/12] server : do not normalize embeddings when there is no pooling ggml-ci --- common/common.cpp | 4 +++- common/common.h | 3 ++- examples/gritlm/gritlm.cpp | 2 +- examples/retrieval/retrieval.cpp | 2 +- examples/server/server.cpp | 10 ++++++++-- examples/server/tests/unit/test_embedding.py | 5 +++++ 6 files changed, 20 insertions(+), 6 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c0c98232ed3bb..05d3ba766e38b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1780,7 +1780,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) break; case 0: // max absolute for (int i = 0; i < n; i++) { - if (sum < std::abs(inp[i])) sum = std::abs(inp[i]); + if (sum < std::abs(inp[i])) { + sum = std::abs(inp[i]); + } } sum /= 32760.0; // make an int16 range break; diff --git a/common/common.h b/common/common.h index 5f556c24d933c..ec0e49f6f1806 100644 --- a/common/common.h +++ b/common/common.h @@ -596,7 +596,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si // Embedding utils // -void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); +// TODO: repace embd_norm with an enum +void common_embd_normalize(const float * inp, float * out, int n, int embd_norm); float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 6e42fa0734ecb..18a945b33905f 100644 --- a/examples/gritlm/gritlm.cpp +++ 
b/examples/gritlm/gritlm.cpp @@ -75,7 +75,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } std::vector emb_norm(emb_unorm.size()); - common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); + common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2); result.push_back(emb_norm); #ifdef GRIT_DEBUG diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 23ff4db27a420..a5c6fe7e58523 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -107,7 +107,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } float * out = output + batch.seq_id[i][0] * n_embd; - common_embd_normalize(embd, out, n_embd); + common_embd_normalize(embd, out, n_embd, 2); } } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index bb0cd1f2b1239..c1c6bf1ea42ac 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2059,8 +2059,14 @@ struct server_context { continue; } - common_embd_normalize(embd, embd_res.data(), n_embd); - res->embedding.push_back(embd_res); + // normalize only when there is pooling + // TODO: configurable + if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) { + common_embd_normalize(embd, embd_res.data(), n_embd, 2); + res->embedding.push_back(embd_res); + } else { + res->embedding.push_back({ embd, embd + n_embd }); + } } SLT_DBG(slot, "%s", "sending embeddings\n"); diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py index 32088d0a044d9..b5348120a74c6 100644 --- a/examples/server/tests/unit/test_embedding.py +++ b/examples/server/tests/unit/test_embedding.py @@ -87,6 +87,10 @@ def test_embedding_pooling_none(): assert 'embedding' in res.body[0] assert len(res.body[0]['embedding']) == 3 + # make sure embedding vector is not normalized + for x in res.body[0]['embedding']: + assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON + def 
test_embedding_pooling_none_oai(): global server @@ -95,6 +99,7 @@ def test_embedding_pooling_none_oai(): res = server.make_request("POST", "/v1/embeddings", data={ "input": "hello hello hello", }) + # /v1/embeddings does not support pooling type 'none' assert res.status_code == 400 From 3a7c001fe3469847b37721f437122d097878e584 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 16:12:15 +0200 Subject: [PATCH 08/12] server : update readme ggml-ci --- examples/server/README.md | 42 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index 0787bfc306cfb..22ef92dae3972 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -763,6 +763,8 @@ curl http://localhost:8080/v1/chat/completions \ ### POST `/v1/embeddings`: OpenAI-compatible embeddings API +This endpoint requires that the model uses a pooling different than type `none`. + *Options:* See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings). @@ -795,7 +797,45 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r }' ``` -When `--pooling none` is used, the server will output an array of embeddings - one for each token in the input. +### POST `/embeddings`: non-OpenAI-compatible embeddings API + +This endpoint supports `--pooling none`. When used, the responses will contain the embeddings for all input tokens. +Note that the response format is slightly different than `/v1/embeddings` - it does not have the `"data"` sub-tree and the +embeddings are always returned as vector of vectors. + +*Options:* + +Same as the `/v1/embeddings` endpoint. + +*Examples:* + +Same as the `/v1/embeddings` endpoint. + +**Response format** + +```json +[ + { + "index": 0, + "embedding": [ + [ ... embeddings for token 0 ... ], + [ ... embeddings for token 1 ... ], + [ ... ] + [ ... embeddings for token N-1 ... ], + ] + }, + ... 
+ { + "index": P, + "embedding": [ + [ ... embeddings for token 0 ... ], + [ ... embeddings for token 1 ... ], + [ ... ] + [ ... embeddings for token N-1 ... ], + ] + } +] +``` ### GET `/slots`: Returns the current slots processing state From 87df60166d9baa1ea1a48329ba6edfbdb1b038b1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 11:13:29 +0200 Subject: [PATCH 09/12] server : fixes --- examples/server/server.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c1c6bf1ea42ac..958860bfb8a47 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1018,7 +1018,6 @@ struct server_slot { n_prompt_tokens = 0; last_nl_pos = 0; generated_text = ""; - generated_tokens = {}; has_new_line = false; truncated = false; stop = STOP_TYPE_NONE; @@ -3733,7 +3732,7 @@ int main(int argc, char ** argv) { task.prompt_tokens = std::move(tokenized_prompts[i]); // OAI-compat - task.params.oaicompat = oaicompat;; + task.params.oaicompat = oaicompat; tasks.push_back(task); } From 2a5510ed824a20e9e7e4f07570bcf04c26fb8be5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 11:33:46 +0200 Subject: [PATCH 10/12] tests : update server tests ggml-ci --- examples/server/server.cpp | 10 +++++----- examples/server/tests/unit/test_embedding.py | 21 ++++++++++---------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 958860bfb8a47..de1382a141b09 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -744,16 +744,16 @@ struct server_task_result_embd : server_task_result { json to_json_non_oaicompat() { return json { - {"index", index}, - {"embedding", embedding}, - {"tokens_evaluated", n_tokens}, + {"index", index}, + {"embedding", embedding}, }; } json to_json_oaicompat() { return json { - {"index", index}, - {"embedding", embedding[0]}, + {"index", index}, + 
{"embedding", embedding[0]}, + {"tokens_evaluated", n_tokens}, }; } }; diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py index b5348120a74c6..e32d745829605 100644 --- a/examples/server/tests/unit/test_embedding.py +++ b/examples/server/tests/unit/test_embedding.py @@ -48,7 +48,7 @@ def test_embedding_multiple(): @pytest.mark.parametrize( - "content,is_multi_prompt", + "input,is_multi_prompt", [ # single prompt ("string", False), @@ -61,19 +61,20 @@ def test_embedding_multiple(): ([[12, 34, 56], [12, "string", 34, 56]], True), ] ) -def test_embedding_mixed_input(content, is_multi_prompt: bool): +def test_embedding_mixed_input(input, is_multi_prompt: bool): global server server.start() - res = server.make_request("POST", "/embeddings", data={"content": content}) + res = server.make_request("POST", "/v1/embeddings", data={"input": input}) assert res.status_code == 200 + data = res.body['data'] if is_multi_prompt: - assert len(res.body) == len(content) - for d in res.body: + assert len(data) == len(input) + for d in data: assert 'embedding' in d assert len(d['embedding']) > 1 else: - assert 'embedding' in res.body - assert len(res.body['embedding']) > 1 + assert 'embedding' in data[0] + assert len(data[0]['embedding']) > 1 def test_embedding_pooling_none(): @@ -85,7 +86,7 @@ def test_embedding_pooling_none(): }) assert res.status_code == 200 assert 'embedding' in res.body[0] - assert len(res.body[0]['embedding']) == 3 + assert len(res.body[0]['embedding']) == 5 # 3 text tokens + 2 special # make sure embedding vector is not normalized for x in res.body[0]['embedding']: @@ -172,7 +173,7 @@ def test_same_prompt_give_same_result(): def test_embedding_usage_single(content, n_tokens): global server server.start() - res = server.make_request("POST", "/embeddings", data={"input": content}) + res = server.make_request("POST", "/v1/embeddings", data={"input": content}) assert res.status_code == 200 assert 
res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] assert res.body['usage']['prompt_tokens'] == n_tokens @@ -181,7 +182,7 @@ def test_embedding_usage_single(content, n_tokens): def test_embedding_usage_multiple(): global server server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": [ "I believe the meaning of life is", "I believe the meaning of life is", From 600cebc9a8c3ad49e17b8945090d1ac3b94420a7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 11:55:28 +0200 Subject: [PATCH 11/12] server : update readme [no ci] --- examples/server/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 22ef92dae3972..d006a8d37cf6f 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -763,7 +763,7 @@ curl http://localhost:8080/v1/chat/completions \ ### POST `/v1/embeddings`: OpenAI-compatible embeddings API -This endpoint requires that the model uses a pooling different than type `none`. +This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm. *Options:* @@ -799,9 +799,9 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r ### POST `/embeddings`: non-OpenAI-compatible embeddings API -This endpoint supports `--pooling none`. When used, the responses will contain the embeddings for all input tokens. -Note that the response format is slightly different than `/v1/embeddings` - it does not have the `"data"` sub-tree and the -embeddings are always returned as vector of vectors. +This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. 
For all other pooling types, only the pooled embeddings are returned, normalized using the Euclidean norm. + +Note that the response format of this endpoint is different from `/v1/embeddings`. *Options:* From 2dcdd483d4928f0021f9536a13823bff441fb361 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 12:43:04 +0200 Subject: [PATCH 12/12] server : remove rebase artifact --- examples/server/server.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index de1382a141b09..5ed4e8d274428 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -704,7 +704,6 @@ struct server_task_result_cmpl_partial : server_task_result { {"delta", json { {"content", content}, - {"tokens", tokens} }}, }}); }