
Commit f2d14b8
Committed Nov 14, 2024
synced to llama.cpp tag:b4079 short:4a8ccb3
1 parent fb9a644

5 files changed, +176 -56 lines
 

src/cyllama/cyllama.pyx (+49 -26)
@@ -4023,42 +4023,65 @@ cdef class LlamaContext:
         """
         cdef int n_vocab = self.model.n_vocab
         cdef float * logits = llama_cpp.llama_get_logits(self.ptr)
+        if logits is NULL:
+            # TODO: should one just return [] here?
+            raise ValueError('no logits available')
         cdef vector[float] vec
         for i in range(n_vocab):
             vec.push_back(logits[i])
         return vec
 
-    # def get_logits_ith(self, int i):
-    #     """Logits for the ith token. For positive indices,
+    def get_logits_ith(self, int i):
+        """Logits for the ith token. For positive indices,
 
-    #     Equivalent to:
-    #         llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
-    #     Negative indicies can be used to access logits in reverse order, -1 is the last logit.
-    #     returns NULL for invalid ids.
-    #     """
-    #     cdef float * logits = llama_get_logits_ith( llama_context * ctx, int32_t i)
-
-    # def get_embeddings(self):
-    #     """Get all output token embeddings.
+        Equivalent to:
+            llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+        Negative indicies can be used to access logits in reverse order, -1 is the last logit.
+        returns NULL for invalid ids.
+        """
+        cdef int n_vocab = self.model.n_vocab
+        cdef float * logits = llama_cpp.llama_get_logits_ith(self.ptr, i)
+        cdef vector[float] vec
+        if logits is NULL:
+            raise ValueError(f"{i} is an invalid id")
+        for i in range(n_vocab):
+            vec.push_back(logits[i])
+        return vec
 
-    # when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
-    # the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
-    # in the order they have appeared in the batch.
-    # shape: [n_outputs*n_embd]
-    # Otherwise, returns NULL.
-    # """
-    # cdef float * embds = llama_cpp.llama_get_embeddings(self.ptr)
+    def get_embeddings(self):
+        """Get all output token embeddings.
 
+        when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+        the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+        in the order they have appeared in the batch.
+        shape: [n_outputs * n_embd]
+        Otherwise, returns NULL.
+        """
+        cdef int n_embd = self.model.n_embd
+        cdef float * embds = llama_cpp.llama_get_embeddings(self.ptr)
+        cdef vector[float] vec
+        if embds is NULL:
+            # TODO: should one just return [] here?
+            raise ValueError('no embeddings available')
+        for i in range(n_embd):
+            vec.push_back(embds[i])
+        return vec
 
-    # def get_embeddings_ith(self, int i):
-    #     """Get the embeddings for the ith token. For positive indices, Equivalent to:
+    def get_embeddings_ith(self, int i):
+        """Get the embeddings for the ith token. For positive indices, Equivalent to:
 
-    #     llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
-    #     Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
-    #     returns NULL for invalid ids.
-    #     """
-    #     cdef float * embds = llama_cpp.llama_get_embeddings_ith(self.ptr, i)
-
+        llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+        Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
+        returns NULL for invalid ids.
+        """
+        cdef int n_embd = self.model.n_embd
+        cdef float * embds = llama_cpp.llama_get_embeddings_ith(self.ptr, i)
+        cdef vector[float] vec
+        if embds is NULL:
+            raise ValueError(f"{i} is an invalid id")
+        for i in range(n_embd):
+            vec.push_back(embds[i])
+        return vec
 
     # def get_embeddings_seq(self, int seq_id):
     #     """Get the embeddings for a sequence id

src/cyllama/llama_cpp.pxd (+2 -2)
@@ -914,13 +914,13 @@ cdef extern from "llama.h":
     # Processes a batch of tokens with the ecoder part of the encoder-decoder model.
     # Stores the encoder output internally for later use by the decoder cross-attention layers.
     # 0 - success
-    # < 0 - error
+    # < 0 - error. the KV cache state is restored to the state before this call
     cdef int32_t llama_encode(llama_context * ctx, llama_batch batch)
 
     # Positive return values does not mean a fatal error, but rather a warning.
     # 0 - success
     # 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    # < 0 - error
+    # < 0 - error. the KV cache state is restored to the state before this call
     cdef int32_t llama_decode(llama_context * ctx, llama_batch batch)
 
     # Set the number of threads used for decoding
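
These comment updates track upstream llama.cpp: a negative return value from llama_encode/llama_decode now also guarantees that the KV cache is rolled back to its pre-call state. As a purely illustrative sketch (the helper name is invented, and cyllama's LlamaContext.decode may well handle this internally rather than expose the raw code), the documented contract reads roughly as:

# Illustrative only: interprets the llama_decode return codes documented above.
def check_decode_result(ret: int) -> None:
    """Hypothetical helper; assumes the caller can see llama_decode's raw int32 result."""
    if ret == 0:
        return  # success
    if ret == 1:
        # warning, not fatal: no KV slot found for the batch;
        # retry with a smaller batch or a larger context
        raise RuntimeError("could not find a KV slot for the batch")
    # ret < 0: hard error; per the updated comment, the KV cache state has
    # already been restored to what it was before the call
    raise RuntimeError(f"llama_decode failed with code {ret}")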

tests/scratch.py (+122 -22)
@@ -7,32 +7,132 @@
 
 model_path = str(ROOT / 'models' / 'Llama-3.2-1B-Instruct-Q8_0.gguf')
 
-cy.llama_backend_init()
-model = cy.LlamaModel(model_path)
-ctx = cy.LlamaContext(model)
-assert ctx.n_ctx > 0
-# cy.llama_backend_free()
+params = cy.CommonParams()
+params.model = model_path
+params.prompt = "When did the universe begin?"
+params.n_predict = 32
+params.n_ctx = 512
+params.cpuparams.n_threads = 4
 
+# total length of the sequence including the prompt
+n_predict: int = params.n_predict
 
-# params = cy.CommonParams()
-# cy.common_init()
+# init LLM
+cy.llama_backend_init()
+cy.llama_numa_init(params.numa)
 
-# params.model = model_path
-# params.prompt = "When did the universe begin?"
-# params.n_predict = 32
-# params.n_ctx = 512
-# params.cpuparams.n_threads = 4
+# initialize the model
 
-# # total length of the sequence including the prompt
-# n_predict: int = params.n_predict
+model_params = cy.common_model_params_to_llama(params)
 
-# # init LLM
-# cy.llama_backend_init()
-# cy.llama_numa_init(params.numa)
+# set local test model
+params.model = model_path
 
-# # load the model and apply lora adapter, if any
-# llama_init = cy.CommonInitResult(params)
-# model = llama_init.model;
-# ctx = llama_init.context;
+model = cy.LlamaModel(path_model=params.model, params=model_params)
+
+# initialize the context
+ctx_params = cy.common_context_params_to_llama(params)
+ctx = cy.LlamaContext(model=model, params=ctx_params)
+
+
+# build sampler chain
+sparams = cy.LlamaSamplerChainParams()
+sparams.no_perf = False
+
+smplr = cy.LlamaSampler(sparams)
+
+smplr.add_greedy()
+
+
+# tokenize the prompt
+
+tokens_list: list[int] = cy.common_tokenize(ctx, params.prompt, True)
+
+n_ctx: int = ctx.n_ctx
+
+n_kv_req: int = len(tokens_list) + (n_predict - len(tokens_list))
+
+print("n_predict = %d, n_ctx = %d, n_kv_req = %d" % (n_predict, n_ctx, n_kv_req))
+
+if (n_kv_req > n_ctx):
+    raise SystemExit(
+        "error: n_kv_req > n_ctx, the required KV cache size is not big enough\n"
+        "either reduce n_predict or increase n_ctx.")
+
+# print the prompt token-by-token
+print()
+prompt=""
+for i in tokens_list:
+    prompt += cy.common_token_to_piece(ctx, i)
+print(prompt)
+
+# create a llama_batch with size 512
+# we use this object to submit token data for decoding
+
+# create batch
+batch = cy.LlamaBatch(n_tokens=512, embd=0, n_seq_max=1)
+
+# evaluate the initial prompt
+for i, token in enumerate(tokens_list):
+    cy.common_batch_add(batch, token, i, [0], False)
+
+# llama_decode will output logits only for the last token of the prompt
+# batch.logits[batch.n_tokens - 1] = True
+batch.set_last_logits_to_true()
+
+# logits = batch.get_logits()
+
+ctx.decode(batch)
+
+# main loop
+
+# n_cur: int = batch.n_tokens
+# n_decode: int = 0
+
+# t_main_start: int = cy.ggml_time_us()
+
+# result: str = ""
+
+# while (n_cur <= n_predict):
+#     # sample the next token
+
+#     if True:
+#         new_token_id = smplr.sample(ctx, batch.n_tokens - 1)
+
+#         # print("new_token_id: ", new_token_id)
+
+#         smplr.accept(new_token_id)
+
+#         # is it an end of generation?
+#         if (model.token_is_eog(new_token_id) or n_cur == n_predict):
+#             print()
+#             break
+
+#         result += cy.common_token_to_piece(ctx, new_token_id)
+
+#     # prepare the next batch
+#     cy.common_batch_clear(batch);
+
+#     # push this new token for next evaluation
+#     cy.common_batch_add(batch, new_token_id, n_cur, [0], True)
+
+#     n_decode += 1
+
+#     n_cur += 1
+
+#     # evaluate the current batch with the transformer model
+#     ctx.decode(batch)
+
+
+# print(result)
+
+# print()
+
+# t_main_end: int = cy.ggml_time_us()
+
+# print("decoded %d tokens in %.2f s, speed: %.2f t/s" %
+#     (n_decode, (t_main_end - t_main_start) / 1000000.0, n_decode / ((t_main_end - t_main_start) / 1000000.0)))
+# print()
+
+cy.llama_backend_free()
 
-# cy.llama_backend_free()
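
The generation loop in the new scratch.py is still commented out. For reference, here is a sketch of what it would look like once enabled, placed between ctx.decode(batch) and cy.llama_backend_free(); it only uncomments and tidies the code already shown in the diff above.

# Sketch: the commented-out main loop above, enabled (not part of the commit).
n_cur: int = batch.n_tokens
n_decode: int = 0
t_main_start: int = cy.ggml_time_us()
result: str = ""

while n_cur <= n_predict:
    # sample the next token from the logits of the last decoded position
    new_token_id = smplr.sample(ctx, batch.n_tokens - 1)
    smplr.accept(new_token_id)

    # stop at end-of-generation or when the token budget is spent
    if model.token_is_eog(new_token_id) or n_cur == n_predict:
        print()
        break

    result += cy.common_token_to_piece(ctx, new_token_id)

    # prepare the next batch: push the new token for the next evaluation
    cy.common_batch_clear(batch)
    cy.common_batch_add(batch, new_token_id, n_cur, [0], True)

    n_decode += 1
    n_cur += 1

    # evaluate the current batch with the transformer model
    ctx.decode(batch)

print(result)
print()

t_main_end: int = cy.ggml_time_us()
print("decoded %d tokens in %.2f s, speed: %.2f t/s" %
      (n_decode,
       (t_main_end - t_main_start) / 1000000.0,
       n_decode / ((t_main_end - t_main_start) / 1000000.0)))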

tests/test_platform.cpp (+1 -4)
@@ -7,9 +7,7 @@
 
 
 int main() {
-    // path to the model gguf file
     std::string model_path = "models/Llama-3.2-1B-Instruct-Q8_0.gguf";
-    // prompt to generate text from
     std::string prompt = "Is Mathematics invented or discovered?";
     // number of layers to offload to the GPU
     int ngl = 99;
@@ -18,12 +16,11 @@ int main() {
 
 
     // initialize the model
-
     llama_model_params model_params = llama_model_default_params();
     model_params.n_gpu_layers = ngl;
-
    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
 
+    // model properties
     uint64_t n_params = llama_model_n_params(model);
     uint64_t size = llama_model_size(model);
 

thirdparty/llama.cpp/include/llama.h (+2 -2)
@@ -797,15 +797,15 @@ extern "C" {
     // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
     // Stores the encoder output internally for later use by the decoder cross-attention layers.
     // 0 - success
-    // < 0 - error
+    // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
             struct llama_batch batch);
 
     // Positive return values does not mean a fatal error, but rather a warning.
     // 0 - success
     // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    // < 0 - error
+    // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
             struct llama_batch batch);
