From a57b99b07b40d1082f69a8fc5b968d56bc3636bc Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 24 Jan 2025 16:52:57 +0800 Subject: [PATCH] FEAT: support deepseek-r1-distill-qwen (#2781) --- README.md | 11 +- README_ja_JP.md | 14 +- README_zh_CN.md | 14 +- doc/source/getting_started/installation.rst | 2 +- .../builtin/llm/deepseek-r1-distill-qwen.rst | 303 ++++++++++++++++++ doc/source/models/builtin/llm/index.rst | 7 + doc/source/user_guide/backends.rst | 2 +- xinference/core/chat_interface.py | 7 +- xinference/deploy/docker/requirements.txt | 2 +- xinference/model/llm/llm_family.json | 221 +++++++++++++ .../model/llm/llm_family_modelscope.json | 188 +++++++++++ xinference/model/llm/vllm/core.py | 2 +- 12 files changed, 748 insertions(+), 25 deletions(-) create mode 100644 doc/source/models/builtin/llm/deepseek-r1-distill-qwen.rst diff --git a/README.md b/README.md index 54481cbe24..9df06e80f1 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,6 @@ [![License](https://img.shields.io/pypi/l/xinference.svg?style=for-the-badge)](https://github.com/xorbitsai/inference/blob/main/LICENSE) [![Build Status](https://img.shields.io/github/actions/workflow/status/xorbitsai/inference/python.yaml?branch=main&style=for-the-badge&label=GITHUB%20ACTIONS&logo=github)](https://actions-badge.atrox.dev/xorbitsai/inference/goto?ref=main) [![Discord](https://img.shields.io/badge/join_Discord-5462eb.svg?logo=discord&style=for-the-badge&logoColor=%23f5f5f5)](https://discord.gg/Xw9tszSkr5) -[![Slack](https://img.shields.io/badge/join_Slack-781FF5.svg?logo=slack&style=for-the-badge)](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) [![Twitter](https://img.shields.io/twitter/follow/xorbitsio?logo=x&style=for-the-badge)](https://twitter.com/xorbitsio)

@@ -177,11 +176,11 @@ Once Xinference is running, there are multiple ways you can try it: via the web ## Getting involved -| Platform | Purpose | -|-----------------------------------------------------------------------------------------------|----------------------------------------------------| -| [Github Issues](https://github.com/xorbitsai/inference/issues) | Reporting bugs and filing feature requests. | -| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | Collaborating with other Xorbits users. | -| [Twitter](https://twitter.com/xorbitsio) | Staying up-to-date on new features. | +| Platform | Purpose | +|-------------------------------------------------------------------------------------------------|---------------------------------------------| +| [Github Issues](https://github.com/xorbitsai/inference/issues) | Reporting bugs and filing feature requests. | +| [Discord](https://discord.gg/Xw9tszSkr5) | Collaborating with other Xinference users. | +| [Twitter](https://twitter.com/xorbitsio) | Staying up-to-date on new features. | ## Citation diff --git a/README_ja_JP.md b/README_ja_JP.md index ff3f0e4861..491c4d913c 100644 --- a/README_ja_JP.md +++ b/README_ja_JP.md @@ -6,7 +6,7 @@ [![PyPI Latest Release](https://img.shields.io/pypi/v/xinference.svg?style=for-the-badge)](https://pypi.org/project/xinference/) [![License](https://img.shields.io/pypi/l/xinference.svg?style=for-the-badge)](https://github.com/xorbitsai/inference/blob/main/LICENSE) [![Build Status](https://img.shields.io/github/actions/workflow/status/xorbitsai/inference/python.yaml?branch=main&style=for-the-badge&label=GITHUB%20ACTIONS&logo=github)](https://actions-badge.atrox.dev/xorbitsai/inference/goto?ref=main) -[![Slack](https://img.shields.io/badge/join_Slack-781FF5.svg?logo=slack&style=for-the-badge)](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) +[![Discord](https://img.shields.io/badge/join_Discord-5462eb.svg?logo=discord&style=for-the-badge&logoColor=%23f5f5f5)](https://discord.gg/Xw9tszSkr5) [![Twitter](https://img.shields.io/twitter/follow/xorbitsio?logo=x&style=for-the-badge)](https://twitter.com/xorbitsio)

@@ -24,7 +24,7 @@ Xorbits Inference(Xinference) は、言語、音声認識、マルチモーダ 研究者、開発者、データサイエンティストを問わず、最先端の AI モデルの可能性を最大限に引き出すことができます。

-👉 Slack コミュニティにご参加ください! +👉 Discord コミュニティにご参加ください!
@@ -98,11 +98,11 @@ Xinferenceが実行されると、Web UI、cURL、コマンドライン、また ## 関与する -| プラットフォーム | 目的 | -|---------------------------------------------------------------------------------------------------|--------------------------------------------------| -| [Github イシュー](https://github.com/xorbitsai/inference/issues) | バグ報告と機能リクエストの提出。 | -| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | 他のXorbitsユーザーとの協力。 | -| [Twitter](https://twitter.com/xorbitsio) | 新機能に関する最新情報の入手。 | +| プラットフォーム | 目的 | +|-------------------------------------------------------------------------------------------------|-----------------------| +| [Github イシュー](https://github.com/xorbitsai/inference/issues) | バグ報告と機能リクエストの提出。 | +| [Discord](https://discord.gg/Xw9tszSkr5) | 他のXinferenceユーザーとの協力。 | +| [Twitter](https://twitter.com/xorbitsio) | 新機能に関する最新情報の入手。 | ## 引用 diff --git a/README_zh_CN.md b/README_zh_CN.md index afd3054b8d..b9cb0d471b 100644 --- a/README_zh_CN.md +++ b/README_zh_CN.md @@ -158,13 +158,13 @@ $ xinference-local ## 参与其中 -| 平台 | 目的 | -|------------------------------------------------------------------------------------------------|--------------------------------------------------| -| [Github 问题](https://github.com/xorbitsai/inference/issues) | 报告错误和提交功能请求。 | -| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | 与其他 Xorbits 用户合作。 | -| [Twitter](https://twitter.com/xorbitsio) | 及时了解新功能。 | -| [微信社群](https://xorbits.cn/assets/images/wechat_work_qr.png) | 与其他 Xorbits 用户交流。 | -| [知乎](https://zhihu.com/org/xorbits) | 了解团队最新的进展。 | +| 平台 | 目的 | +|-------------------------------------------------------------------------------------------------|----------------------| +| [Github 问题](https://github.com/xorbitsai/inference/issues) | 报告错误和提交功能请求。 | +| [Discord](https://discord.gg/Xw9tszSkr5) | 与其他 Xinference 用户合作。 | +| [Twitter](https://twitter.com/xorbitsio) | 及时了解新功能。 | +| [微信社群](https://xorbits.cn/assets/images/wechat_work_qr.png) | 与其他 Xinference 用户交流。 | +| [知乎](https://zhihu.com/org/xorbits) | 了解团队最新的进展。 | ## 引用 diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst index ad973c697a..84fc13e153 100644 --- a/doc/source/getting_started/installation.rst +++ b/doc/source/getting_started/installation.rst @@ -44,7 +44,7 @@ Currently, supported models include: - ``codestral-v0.1`` - ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k`` - ``code-llama``, ``code-llama-python``, ``code-llama-instruct`` -- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` +- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-r1-distill-qwen``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` - ``yi-coder``, ``yi-coder-chat`` - ``codeqwen1.5``, ``codeqwen1.5-chat`` - ``qwen2.5``, ``qwen2.5-coder``, ``qwen2.5-instruct``, ``qwen2.5-coder-instruct`` diff --git a/doc/source/models/builtin/llm/deepseek-r1-distill-qwen.rst b/doc/source/models/builtin/llm/deepseek-r1-distill-qwen.rst new file mode 100644 index 0000000000..edaac6ea35 --- /dev/null +++ b/doc/source/models/builtin/llm/deepseek-r1-distill-qwen.rst @@ -0,0 +1,303 @@ +.. 
_models_llm_deepseek-r1-distill-qwen: + +======================================== +deepseek-r1-distill-qwen +======================================== + +- **Context Length:** 131072 +- **Model Name:** deepseek-r1-distill-qwen +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 1_5 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- **Model Hubs**: `Hugging Face <https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B>`__, `ModelScope <https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 1_5 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (awq, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 1_5 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** casperhansen/deepseek-r1-distill-qwen-1.5b-awq +- **Model Hubs**: `Hugging Face <https://huggingface.co/casperhansen/deepseek-r1-distill-qwen-1.5b-awq>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 1_5 --model-format awq --quantization ${quantization} + + +Model Spec 3 (gptq, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 1_5 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** jakiAJK/DeepSeek-R1-Distill-Qwen-1.5B_GPTQ-int4 +- **Model Hubs**: `Hugging Face <https://huggingface.co/jakiAJK/DeepSeek-R1-Distill-Qwen-1.5B_GPTQ-int4>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 1_5 --model-format gptq --quantization ${quantization} + + +Model Spec 4 (ggufv2, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 1_5 +- **Quantizations:** Q2_K, Q2_K_L, Q3_K_M, Q4_K_M, Q5_K_M, Q6_K, Q8_0 +- **Engines**: llama.cpp +- **Model ID:** unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF +- **Model Hubs**: `Hugging Face <https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF>`__, `ModelScope <https://modelscope.cn/models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 1_5 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 5 (mlx, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 1_5 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: MLX +- **Model ID:** mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + +
xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 1_5 --model-format mlx --quantization ${quantization} + + +Model Spec 6 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +- **Model Hubs**: `Hugging Face <https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B>`__, `ModelScope <https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 7 (awq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** jakiAJK/DeepSeek-R1-Distill-Qwen-7B_AWQ +- **Model Hubs**: `Hugging Face <https://huggingface.co/jakiAJK/DeepSeek-R1-Distill-Qwen-7B_AWQ>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 7 --model-format awq --quantization ${quantization} + + +Model Spec 8 (gptq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** jakiAJK/DeepSeek-R1-Distill-Qwen-7B_GPTQ-int4 +- **Model Hubs**: `Hugging Face <https://huggingface.co/jakiAJK/DeepSeek-R1-Distill-Qwen-7B_GPTQ-int4>`__, `ModelScope <https://modelscope.cn/models/tclf90/deepseek-r1-distill-qwen-7b-gptq-int4>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 7 --model-format gptq --quantization ${quantization} + + +Model Spec 9 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q2_K_L, Q3_K_M, Q4_K_M, Q5_K_M, Q6_K, Q8_0, F16 +- **Engines**: llama.cpp +- **Model ID:** unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF +- **Model Hubs**: `Hugging Face <https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF>`__, `ModelScope <https://modelscope.cn/models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 10 (mlx, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 7 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: MLX +- **Model ID:** mlx-community/DeepSeek-R1-Distill-Qwen-7B-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/DeepSeek-R1-Distill-Qwen-7B-{quantization}>`__, `ModelScope <https://modelscope.cn/models/okwinds/DeepSeek-R1-Distill-Qwen-7B-MLX-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 7 --model-format mlx --quantization ${quantization} + + +Model Spec 11 (pytorch, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model 
Format:** pytorch +- **Model Size (in billions):** 14 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-R1-Distill-Qwen-14B +- **Model Hubs**: `Hugging Face <https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B>`__, `ModelScope <https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 14 --model-format pytorch --quantization ${quantization} + + +Model Spec 12 (awq, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 14 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** casperhansen/deepseek-r1-distill-qwen-14b-awq +- **Model Hubs**: `Hugging Face <https://huggingface.co/casperhansen/deepseek-r1-distill-qwen-14b-awq>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 14 --model-format awq --quantization ${quantization} + + +Model Spec 13 (ggufv2, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 14 +- **Quantizations:** Q2_K, Q2_K_L, Q3_K_M, Q4_K_M, Q5_K_M, Q6_K, Q8_0, F16 +- **Engines**: llama.cpp +- **Model ID:** unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF +- **Model Hubs**: `Hugging Face <https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF>`__, `ModelScope <https://modelscope.cn/models/unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 14 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 14 (mlx, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 14 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: MLX +- **Model ID:** mlx-community/DeepSeek-R1-Distill-Qwen-14B-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/DeepSeek-R1-Distill-Qwen-14B-{quantization}>`__, `ModelScope <https://modelscope.cn/models/okwinds/DeepSeek-R1-Distill-Qwen-14B-MLX-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 14 --model-format mlx --quantization ${quantization} + + +Model Spec 15 (pytorch, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 32 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +- **Model Hubs**: `Hugging Face <https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B>`__, `ModelScope <https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 32 --model-format pytorch --quantization ${quantization} + + +Model Spec 16 (awq, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 32 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** 
casperhansen/deepseek-r1-distill-qwen-32b-awq +- **Model Hubs**: `Hugging Face <https://huggingface.co/casperhansen/deepseek-r1-distill-qwen-32b-awq>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 32 --model-format awq --quantization ${quantization} + + +Model Spec 17 (ggufv2, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 32 +- **Quantizations:** Q2_K, Q2_K_L, Q3_K_M, Q4_K_M, Q5_K_M, Q6_K, Q8_0, F16 +- **Engines**: llama.cpp +- **Model ID:** unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF +- **Model Hubs**: `Hugging Face <https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF>`__, `ModelScope <https://modelscope.cn/models/unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 32 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 18 (mlx, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 32 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: MLX +- **Model ID:** mlx-community/DeepSeek-R1-Distill-Qwen-32B-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/DeepSeek-R1-Distill-Qwen-32B-{quantization}>`__, `ModelScope <https://modelscope.cn/models/okwinds/DeepSeek-R1-Distill-Qwen-32B-MLX-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 32 --model-format mlx --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index 38165aa947..204c3a9424 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -131,6 +131,11 @@ The following is a list of built-in LLM in Xinference: - 16384 - deepseek-coder-instruct is a model initialized from deepseek-coder-base and fine-tuned on 2B tokens of instruction data.
+ * - :ref:`deepseek-r1-distill-qwen <models_llm_deepseek-r1-distill-qwen>` + - chat + - 131072 + - deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen + * - :ref:`deepseek-v2 <models_llm_deepseek-v2>` - generate - 128000 @@ -632,6 +637,8 @@ The following is a list of built-in LLM in Xinference: deepseek-coder-instruct + deepseek-r1-distill-qwen + deepseek-v2 deepseek-v2-chat diff --git a/doc/source/user_guide/backends.rst b/doc/source/user_guide/backends.rst index ba610e5d89..9bdfb21b25 100644 --- a/doc/source/user_guide/backends.rst +++ b/doc/source/user_guide/backends.rst @@ -51,7 +51,7 @@ Currently, supported model includes: - ``codestral-v0.1`` - ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k`` - ``code-llama``, ``code-llama-python``, ``code-llama-instruct`` -- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` +- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-r1-distill-qwen``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` - ``yi-coder``, ``yi-coder-chat`` - ``codeqwen1.5``, ``codeqwen1.5-chat`` - ``qwen2.5``, ``qwen2.5-coder``, ``qwen2.5-instruct``, ``qwen2.5-coder-instruct`` diff --git a/xinference/core/chat_interface.py b/xinference/core/chat_interface.py index 08b30ab054..b47461e18c 100644 --- a/xinference/core/chat_interface.py +++ b/xinference/core/chat_interface.py @@ -13,6 +13,7 @@ # limitations under the License. import base64 +import html import logging import os from io import BytesIO @@ -137,7 +138,11 @@ def generate_wrapper( if "content" not in delta: continue else: - response_content += delta["content"] + # some models like deepseek-r1-distill-qwen + # will generate <think> ... </think> ... + # in gradio, no output will be rendered, + # thus escape html tags in advance + response_content += html.escape(delta["content"]) yield response_content yield response_content diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt index c9d1aa8de1..6a611e655b 100644 --- a/xinference/deploy/docker/requirements.txt +++ b/xinference/deploy/docker/requirements.txt @@ -113,5 +113,5 @@ pyzmq>=25.1.2 torchao>=0.7.0 uvloop xgrammar>=0.1.6 -vllm>=0.6.3.post1,<=0.6.4.post1 +vllm==0.6.4.post1 cuda-python diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 5088e0ef46..504a0fc293 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -8716,6 +8716,227 @@ "<|im_end|>" ] }, + { + "version": 1, + "context_length": 131072, + "model_name": "deepseek-r1-distill-qwen", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4" + ], + "model_id": "casperhansen/deepseek-r1-distill-qwen-1.5b-awq" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4" + ], + "model_id": "jakiAJK/DeepSeek-R1-Distill-Qwen-1.5B_GPTQ-int4" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0" + ], + "model_id": 
"unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-1.5B-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": "1_5", + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-{quantization}" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" + }, + { + "model_format": "awq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "jakiAJK/DeepSeek-R1-Distill-Qwen-7B_AWQ" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "jakiAJK/DeepSeek-R1-Distill-Qwen-7B_GPTQ-int4" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-7B-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": 7, + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-7B-{quantization}" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B" + }, + { + "model_format": "awq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4" + ], + "model_id": "casperhansen/deepseek-r1-distill-qwen-14b-awq" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-14B-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": 14, + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-14B-{quantization}" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" + }, + { + "model_format": "awq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4" + ], + "model_id": "casperhansen/deepseek-r1-distill-qwen-32b-awq" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 32, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-32B-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": 32, + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-32B-{quantization}" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in 
messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", + "stop_token_ids": [ + 151643 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, { "version": 1, "context_length": 8192, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 762bcdb690..9dd2390f5b 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -6433,6 +6433,194 @@ "<|im_end|>" ] }, + { + "version": 1, + "context_length": 131072, + "model_name": "deepseek-r1-distill-qwen", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-1.5B-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "tclf90/deepseek-r1-distill-qwen-7b-gptq-int4", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": 
"unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-7B-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "mlx", + "model_size_in_billions": 7, + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit" + ], + "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-7B-MLX-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-14B-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "mlx", + "model_size_in_billions": 14, + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit" + ], + "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-14B-MLX-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4" + ], + "model_id": "tclf90/deepseek-r1-distill-qwen-32b-gptq-int4", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 32, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-32B-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "mlx", + "model_size_in_billions": 32, + "quantizations": [ + "2bit", + "3bit", + "4bit", + "6bit", + "8bit" + ], + "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-32B-MLX-{quantization}", + "model_hub": "modelscope" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + 
'<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", + "stop_token_ids": [ + 151643 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, { "version": 1, "context_length": 8192, diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index b9a8e5388b..92fc2e6dbc 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -157,7 +157,7 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct") VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview") VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1") - + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen") if VLLM_INSTALLED and vllm.__version__ >= "0.3.2": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
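A minimal sketch (not part of the patch) of the escaping behavior added in the ``xinference/core/chat_interface.py`` hunk above. The ``render_stream`` helper and the ``chunks`` list are hypothetical stand-ins for the streamed deltas; without ``html.escape``, gradio's markdown renderer would treat the literal ``<think> ... </think>`` tags emitted by deepseek-r1-distill-qwen as HTML and render nothing::

    import html

    def render_stream(deltas):
        # Accumulate streamed chunks, escaping HTML so literal
        # <think> ... </think> reasoning tags survive gradio's renderer
        # as visible text instead of being parsed as markup.
        response_content = ""
        for delta in deltas:
            if "content" not in delta:
                continue
            response_content += html.escape(delta["content"])
            yield response_content

    # Hypothetical stream: the reasoning tags show up as text.
    chunks = [{"content": "<think>compare 7B vs 32B</think>"},
              {"content": "Use the 32B model."}]
    for text in render_stream(chunks):
        print(text)

The same ``</think>`` marker is what the chat templates above split on (``content.split('</think>')[-1]``), so the reasoning segment of a previous assistant turn is dropped before the conversation is fed back to the model.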