diff --git a/outlines/models/llamacpp.py b/outlines/models/llamacpp.py
index a982b080c..aa260a457 100644
--- a/outlines/models/llamacpp.py
+++ b/outlines/models/llamacpp.py
@@ -166,7 +166,9 @@ def prepare_generation_parameters(
 
     # Somehow `llama-cpp-python` generates `max_tokens + 1` tokens
     if "max_tokens" not in llama_cpp_params:
-        if max_tokens is not None:
+        if max_tokens is None:
+            llama_cpp_params["max_tokens"] = -1  # indicates unlimited tokens
+        else:
             llama_cpp_params["max_tokens"] = max_tokens - 1
     else:
         llama_cpp_params["max_tokens"] = llama_cpp_params["max_tokens"] - 1
diff --git a/tests/generate/test_integration_llamacpp.py b/tests/generate/test_integration_llamacpp.py
index fcd2bfda9..452d22c36 100644
--- a/tests/generate/test_integration_llamacpp.py
+++ b/tests/generate/test_integration_llamacpp.py
@@ -356,3 +356,22 @@ def test_tokenizer_vocabulary_decode_sanity():
         ]
     )
     assert decoded_nl_token == vocab_nl_token
+
+
+def test_no_length_constraint_when_unset():
+    """Assert that models.llamacpp doesn't have an implicit max_tokens preventing full sequence generation"""
+    import llama_cpp
+
+    model = models.llamacpp(
+        repo_id="M4-ai/TinyMistral-248M-v2-Instruct-GGUF",
+        filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf",
+        tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+            "Locutusque/TinyMistral-248M-Instruct"
+        ),
+    )
+
+    long_pattern = "abcdefg" * 10
+    generator = generate.regex(model, long_pattern)
+
+    output = generator("a")
+    assert re.match(long_pattern, output)
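
For reviewers, a rough illustration of the new `max_tokens` mapping (not part of the patch): `resolve_max_tokens` below is a hypothetical stand-in for the patched branch of `prepare_generation_parameters`. It assumes llama-cpp-python's documented convention that a `max_tokens <= 0` means "generate until the context window is exhausted", and that leaving the key unset lets llama-cpp-python fall back to its own small default cap, which is what truncated long structured outputs before this change.

# Illustration only; hypothetical helper mirroring the patched logic, not part of this diff.
from typing import Optional


def resolve_max_tokens(max_tokens: Optional[int], llama_cpp_params: dict) -> dict:
    if "max_tokens" not in llama_cpp_params:
        if max_tokens is None:
            # No user limit: -1 asks llama-cpp-python to generate until the
            # context window is full instead of applying its default cap.
            llama_cpp_params["max_tokens"] = -1
        else:
            # llama-cpp-python generates `max_tokens + 1` tokens, so compensate.
            llama_cpp_params["max_tokens"] = max_tokens - 1
    else:
        llama_cpp_params["max_tokens"] = llama_cpp_params["max_tokens"] - 1
    return llama_cpp_params


assert resolve_max_tokens(None, {}) == {"max_tokens": -1}
assert resolve_max_tokens(100, {}) == {"max_tokens": 99}
assert resolve_max_tokens(None, {"max_tokens": 50}) == {"max_tokens": 49}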