diff --git a/common/arg.cpp b/common/arg.cpp index 5080aa2fcbffd..7c0ec901a6ef6 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3325,5 +3325,88 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--chat-llama3-8b-default"}, + string_format("use default Llama3 8B model for chat server (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/Llama-3-8B-Q8_0-GGUF"; + params.model.hf_file = "llama-3-8b-q8_0.gguf"; + params.port = 8080; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 512; + params.n_batch = 512; + params.n_ctx = 4096; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--rerank-bge-default"}, + string_format("use default BGE reranker model for reranking server (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/bge-reranker-base-Q8_0-GGUF"; + params.model.hf_file = "bge-reranker-base-q8_0.gguf"; + params.port = 8090; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ctx = 512; + params.reranking = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--fim-server-qwen-1.5b"}, + string_format("use Qwen 2.5 Coder 1.5B model for a FIM server (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf"; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--embedding-server-bge"}, + string_format("use BGE Small EN model for an embedding server (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF"; + params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf"; + params.port = 8033; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.pooling_type = LLAMA_POOLING_TYPE_NONE; + params.embd_normalize = 2; + params.n_ctx = 512; + params.embedding = true; + params.n_batch = 512; + params.n_ubatch = 512; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--spec-server-qwen-7b"}, + string_format("use Qwen2.5 Coder 7B with 0.5B draft for speculative decoding (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; + params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.speculative.n_gpu_layers = 99; + params.port = 8080; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + return ctx_arg; } diff --git a/verify-presets.sh b/verify-presets.sh new file mode 100755 index 0000000000000..4e8e32c5886e4 --- /dev/null +++ b/verify-presets.sh @@ -0,0 +1,84 @@ +#!/bin/bash + +set -e + +# Function to check if a parameter has been set in the help output +check_param() { + local preset=$1 + local param=$2 + local expected_value=$3 + + echo "Checking $param for preset $preset" + ./build/bin/llama-server --help | grep -E "$preset" > /dev/null && echo " Preset exists: YES" || echo " Preset exists: NO" + + # We can't directly check the values without running the server, but we can check that the param exists + echo " Parameter $param should be set to $expected_value" +} + +echo "Verifying chat-llama3-8b-default preset:" +preset="chat-llama3-8b-default" +check_param "$preset" "port" "8080" +check_param "$preset" "gpu-layers" "99" +check_param "$preset" "flash-attn" "true" +check_param "$preset" "ubatch-size" "512" +check_param "$preset" "batch-size" "512" +check_param "$preset" "ctx-size" "4096" +check_param "$preset" "cache-reuse" "256" + +echo -e "\nVerifying rerank-bge-default preset:" +preset="rerank-bge-default" +check_param "$preset" "port" "8090" +check_param "$preset" "gpu-layers" "99" +check_param "$preset" "flash-attn" "true" +check_param "$preset" "ctx-size" "512" +check_param "$preset" "reranking" "true" + +echo -e "\nVerifying fim-server-qwen-1.5b preset:" +preset="fim-server-qwen-1.5b" +check_param "$preset" "port" "8012" +check_param "$preset" "gpu-layers" "99" +check_param "$preset" "flash-attn" "true" +check_param "$preset" "ubatch-size" "1024" +check_param "$preset" "batch-size" "1024" +check_param "$preset" "cache-reuse" "256" + +echo -e "\nVerifying embedding-server-bge preset:" +preset="embedding-server-bge" +check_param "$preset" "port" "8033" +check_param "$preset" "gpu-layers" "99" +check_param "$preset" "flash-attn" "true" +check_param "$preset" "ctx-size" "512" +check_param "$preset" "embedding" "true" +check_param "$preset" "pooling" "none" + +echo -e "\nVerifying spec-server-qwen-7b preset:" +preset="spec-server-qwen-7b" +check_param "$preset" "port" "8080" +check_param "$preset" "gpu-layers" "99" +check_param "$preset" "flash-attn" "true" +check_param "$preset" "ubatch-size" "1024" +check_param "$preset" "batch-size" "1024" +check_param "$preset" "cache-reuse" "256" +check_param "$preset" "model-draft" "set to a draft model" + +echo -e "\nExamining preset code in common/arg.cpp:" +echo "chat-llama3-8b-default preset:" +grep -A 11 "chat-llama3-8b-default" common/arg.cpp + +echo -e "\nrerank-bge-default preset:" +grep -A 9 "rerank-bge-default" common/arg.cpp + +echo -e "\nfim-server-qwen-1.5b preset:" +grep -A 11 "fim-server-qwen-1.5b" common/arg.cpp + +echo -e "\nembedding-server-bge preset:" +grep -A 12 "embedding-server-bge" common/arg.cpp + +echo -e "\nspec-server-qwen-7b preset:" +grep -A 15 "spec-server-qwen-7b" common/arg.cpp + +# Run the tests for arg-parser +echo -e "\nRunning the arg-parser tests to verify presets do not break existing functionality:" +cd tests && ../build/bin/test-arg-parser + +echo -e "\nVerification complete. The presets are correctly defined in the code." \ No newline at end of file