
Commit f2d14b8
Committed Nov 14, 2024
synced to llama.cpp tag:b4079 short:4a8ccb3
1 parent fb9a644

5 files changed, +176 -56 lines
 

src/cyllama/cyllama.pyx (+49 -26)
@@ -4023,42 +4023,65 @@ cdef class LlamaContext:
         """
         cdef int n_vocab = self.model.n_vocab
         cdef float * logits = llama_cpp.llama_get_logits(self.ptr)
+        if logits is NULL:
+            # TODO: should one just return [] here?
+            raise ValueError('no logits available')
         cdef vector[float] vec
         for i in range(n_vocab):
             vec.push_back(logits[i])
         return vec
 
-    # def get_logits_ith(self, int i):
-    #     """Logits for the ith token. For positive indices,
+    def get_logits_ith(self, int i):
+        """Logits for the ith token. For positive indices,
 
-    #     Equivalent to:
-    #         llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
-    #     Negative indicies can be used to access logits in reverse order, -1 is the last logit.
-    #     returns NULL for invalid ids.
-    #     """
-    #     cdef float * logits = llama_get_logits_ith( llama_context * ctx, int32_t i)
-
-    # def get_embeddings(self):
-    #     """Get all output token embeddings.
+        Equivalent to:
+            llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+        Negative indicies can be used to access logits in reverse order, -1 is the last logit.
+        returns NULL for invalid ids.
+        """
+        cdef int n_vocab = self.model.n_vocab
+        cdef float * logits = llama_cpp.llama_get_logits_ith(self.ptr, i)
+        cdef vector[float] vec
+        if logits is NULL:
+            raise ValueError(f"{i} is an invalid id")
+        for i in range(n_vocab):
+            vec.push_back(logits[i])
+        return vec
 
-    # when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
-    # the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
-    # in the order they have appeared in the batch.
-    # shape: [n_outputs*n_embd]
-    # Otherwise, returns NULL.
-    # """
-    # cdef float * embds = llama_cpp.llama_get_embeddings(self.ptr)
+    def get_embeddings(self):
+        """Get all output token embeddings.
 
+        when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+        the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+        in the order they have appeared in the batch.
+        shape: [n_outputs * n_embd]
+        Otherwise, returns NULL.
+        """
+        cdef int n_embd = self.model.n_embd
+        cdef float * embds = llama_cpp.llama_get_embeddings(self.ptr)
+        cdef vector[float] vec
+        if embds is NULL:
+            # TODO: should one just return [] here?
+            raise ValueError('no embeddings available')
+        for i in range(n_embd):
+            vec.push_back(embds[i])
+        return vec
 
-    # def get_embeddings_ith(self, int i):
-    #     """Get the embeddings for the ith token. For positive indices, Equivalent to:
+    def get_embeddings_ith(self, int i):
+        """Get the embeddings for the ith token. For positive indices, Equivalent to:
 
-    #     llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
-    #     Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
-    #     returns NULL for invalid ids.
-    #     """
-    #     cdef float * embds = llama_cpp.llama_get_embeddings_ith(self.ptr, i)
-
+        llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+        Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
+        returns NULL for invalid ids.
+        """
+        cdef int n_embd = self.model.n_embd
+        cdef float * embds = llama_cpp.llama_get_embeddings_ith(self.ptr, i)
+        cdef vector[float] vec
+        if embds is NULL:
+            raise ValueError(f"{i} is an invalid id")
+        for i in range(n_embd):
+            vec.push_back(embds[i])
+        return vec
 
     # def get_embeddings_seq(self, int seq_id):
     #     """Get the embeddings for a sequence id

src/cyllama/llama_cpp.pxd (+2 -2)
@@ -914,13 +914,13 @@ cdef extern from "llama.h":
     # Processes a batch of tokens with the ecoder part of the encoder-decoder model.
     # Stores the encoder output internally for later use by the decoder cross-attention layers.
     # 0 - success
-    # < 0 - error
+    # < 0 - error. the KV cache state is restored to the state before this call
     cdef int32_t llama_encode(llama_context * ctx, llama_batch batch)
 
     # Positive return values does not mean a fatal error, but rather a warning.
     # 0 - success
     # 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    # < 0 - error
+    # < 0 - error. the KV cache state is restored to the state before this call
     cdef int32_t llama_decode(llama_context * ctx, llama_batch batch)
 
     # Set the number of threads used for decoding
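
These comment updates track upstream llama.cpp: a negative return value from llama_encode/llama_decode now also guarantees that the KV cache is rolled back to its pre-call state. As a purely illustrative sketch (the helper name is invented, and cyllama's LlamaContext.decode may well handle this internally rather than expose the raw code), the documented contract reads roughly as:

# Illustrative only: interprets the llama_decode return codes documented above.
def check_decode_result(ret: int) -> None:
    """Hypothetical helper; assumes the caller can see llama_decode's raw int32 result."""
    if ret == 0:
        return  # success
    if ret == 1:
        # warning, not fatal: no KV slot found for the batch;
        # retry with a smaller batch or a larger context
        raise RuntimeError("could not find a KV slot for the batch")
    # ret < 0: hard error; per the updated comment, the KV cache state has
    # already been restored to what it was before the call
    raise RuntimeError(f"llama_decode failed with code {ret}")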

tests/scratch.py (+122 -22)
@@ -7,32 +7,132 @@
 
 model_path = str(ROOT / 'models' / 'Llama-3.2-1B-Instruct-Q8_0.gguf')
 
-cy.llama_backend_init()
-model = cy.LlamaModel(model_path)
-ctx = cy.LlamaContext(model)
-assert ctx.n_ctx > 0
-# cy.llama_backend_free()
+params = cy.CommonParams()
+params.model = model_path
+params.prompt = "When did the universe begin?"
+params.n_predict = 32
+params.n_ctx = 512
+params.cpuparams.n_threads = 4
 
+# total length of the sequence including the prompt
+n_predict: int = params.n_predict
 
-# params = cy.CommonParams()
-# cy.common_init()
+# init LLM
+cy.llama_backend_init()
+cy.llama_numa_init(params.numa)
 
-# params.model = model_path
-# params.prompt = "When did the universe begin?"
-# params.n_predict = 32
-# params.n_ctx = 512
-# params.cpuparams.n_threads = 4
+# initialize the model
 
-# # total length of the sequence including the prompt
-# n_predict: int = params.n_predict
+model_params = cy.common_model_params_to_llama(params)
 
-# # init LLM
-# cy.llama_backend_init()
-# cy.llama_numa_init(params.numa)
+# set local test model
+params.model = model_path
 
-# # load the model and apply lora adapter, if any
-# llama_init = cy.CommonInitResult(params)
-# model = llama_init.model;
-# ctx = llama_init.context;
+model = cy.LlamaModel(path_model=params.model, params=model_params)
+
+# initialize the context
+ctx_params = cy.common_context_params_to_llama(params)
+ctx = cy.LlamaContext(model=model, params=ctx_params)
+
+
+# build sampler chain
+sparams = cy.LlamaSamplerChainParams()
+sparams.no_perf = False
+
+smplr = cy.LlamaSampler(sparams)
+
+smplr.add_greedy()
+
+
+# tokenize the prompt
+
+tokens_list: list[int] = cy.common_tokenize(ctx, params.prompt, True)
+
+n_ctx: int = ctx.n_ctx
+
+n_kv_req: int = len(tokens_list) + (n_predict - len(tokens_list))
+
+print("n_predict = %d, n_ctx = %d, n_kv_req = %d" % (n_predict, n_ctx, n_kv_req))
+
+if (n_kv_req > n_ctx):
+    raise SystemExit(
+        "error: n_kv_req > n_ctx, the required KV cache size is not big enough\n"
+        "either reduce n_predict or increase n_ctx.")
+
+# print the prompt token-by-token
+print()
+prompt=""
+for i in tokens_list:
+    prompt += cy.common_token_to_piece(ctx, i)
+print(prompt)
+
+# create a llama_batch with size 512
+# we use this object to submit token data for decoding
+
+# create batch
+batch = cy.LlamaBatch(n_tokens=512, embd=0, n_seq_max=1)
+
+# evaluate the initial prompt
+for i, token in enumerate(tokens_list):
+    cy.common_batch_add(batch, token, i, [0], False)
+
+# llama_decode will output logits only for the last token of the prompt
+# batch.logits[batch.n_tokens - 1] = True
+batch.set_last_logits_to_true()
+
+# logits = batch.get_logits()
+
+ctx.decode(batch)
+
+# main loop
+
+# n_cur: int = batch.n_tokens
+# n_decode: int = 0
+
+# t_main_start: int = cy.ggml_time_us()
+
+# result: str = ""
+
+# while (n_cur <= n_predict):
+#     # sample the next token
+
+#     if True:
+#         new_token_id = smplr.sample(ctx, batch.n_tokens - 1)
+
+#         # print("new_token_id: ", new_token_id)
+
+#         smplr.accept(new_token_id)
+
+#         # is it an end of generation?
+#         if (model.token_is_eog(new_token_id) or n_cur == n_predict):
+#             print()
+#             break
+
+#         result += cy.common_token_to_piece(ctx, new_token_id)
+
+#     # prepare the next batch
+#     cy.common_batch_clear(batch);
+
+#     # push this new token for next evaluation
+#     cy.common_batch_add(batch, new_token_id, n_cur, [0], True)
+
+#     n_decode += 1
+
+#     n_cur += 1
+
+#     # evaluate the current batch with the transformer model
+#     ctx.decode(batch)
+
+
+# print(result)
+
+# print()
+
+# t_main_end: int = cy.ggml_time_us()
+
+# print("decoded %d tokens in %.2f s, speed: %.2f t/s" %
+#     (n_decode, (t_main_end - t_main_start) / 1000000.0, n_decode / ((t_main_end - t_main_start) / 1000000.0)))
+# print()
+
+cy.llama_backend_free()
 
-# cy.llama_backend_free()
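
The generation loop in the new scratch.py is still commented out. For reference, here is a sketch of what it would look like once enabled, placed between ctx.decode(batch) and cy.llama_backend_free(); it only uncomments and tidies the code already shown in the diff above.

# Sketch: the commented-out main loop above, enabled (not part of the commit).
n_cur: int = batch.n_tokens
n_decode: int = 0
t_main_start: int = cy.ggml_time_us()
result: str = ""

while n_cur <= n_predict:
    # sample the next token from the logits of the last decoded position
    new_token_id = smplr.sample(ctx, batch.n_tokens - 1)
    smplr.accept(new_token_id)

    # stop at end-of-generation or when the token budget is spent
    if model.token_is_eog(new_token_id) or n_cur == n_predict:
        print()
        break

    result += cy.common_token_to_piece(ctx, new_token_id)

    # prepare the next batch: push the new token for the next evaluation
    cy.common_batch_clear(batch)
    cy.common_batch_add(batch, new_token_id, n_cur, [0], True)

    n_decode += 1
    n_cur += 1

    # evaluate the current batch with the transformer model
    ctx.decode(batch)

print(result)
print()

t_main_end: int = cy.ggml_time_us()
print("decoded %d tokens in %.2f s, speed: %.2f t/s" %
      (n_decode,
       (t_main_end - t_main_start) / 1000000.0,
       n_decode / ((t_main_end - t_main_start) / 1000000.0)))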

tests/test_platform.cpp (+1 -4)
@@ -7,9 +7,7 @@
 
 
 int main() {
-    // path to the model gguf file
     std::string model_path = "models/Llama-3.2-1B-Instruct-Q8_0.gguf";
-    // prompt to generate text from
     std::string prompt = "Is Mathematics invented or discovered?";
     // number of layers to offload to the GPU
     int ngl = 99;
@@ -18,12 +16,11 @@ int main() {
 
 
     // initialize the model
-
     llama_model_params model_params = llama_model_default_params();
     model_params.n_gpu_layers = ngl;
-
    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
 
+    // model properties
     uint64_t n_params = llama_model_n_params(model);
     uint64_t size = llama_model_size(model);
 

thirdparty/llama.cpp/include/llama.h (+2 -2)
@@ -797,15 +797,15 @@ extern "C" {
     // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
     // Stores the encoder output internally for later use by the decoder cross-attention layers.
     // 0 - success
-    // < 0 - error
+    // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
             struct llama_batch batch);
 
     // Positive return values does not mean a fatal error, but rather a warning.
     // 0 - success
     // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    // < 0 - error
+    // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
             struct llama_batch batch);
