From a57b99b07b40d1082f69a8fc5b968d56bc3636bc Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 24 Jan 2025 16:52:57 +0800 Subject: [PATCH] FEAT: support deepseek-r1-distill-qwen (#2781) --- README.md | 11 +- README_ja_JP.md | 14 +- README_zh_CN.md | 14 +- doc/source/getting_started/installation.rst | 2 +- .../builtin/llm/deepseek-r1-distill-qwen.rst | 303 ++++++++++++++++++ doc/source/models/builtin/llm/index.rst | 7 + doc/source/user_guide/backends.rst | 2 +- xinference/core/chat_interface.py | 7 +- xinference/deploy/docker/requirements.txt | 2 +- xinference/model/llm/llm_family.json | 221 +++++++++++++ .../model/llm/llm_family_modelscope.json | 188 +++++++++++ xinference/model/llm/vllm/core.py | 2 +- 12 files changed, 748 insertions(+), 25 deletions(-) create mode 100644 doc/source/models/builtin/llm/deepseek-r1-distill-qwen.rst diff --git a/README.md b/README.md index 54481cbe24..9df06e80f1 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,6 @@ [![License](https://img.shields.io/pypi/l/xinference.svg?style=for-the-badge)](https://github.com/xorbitsai/inference/blob/main/LICENSE) [![Build Status](https://img.shields.io/github/actions/workflow/status/xorbitsai/inference/python.yaml?branch=main&style=for-the-badge&label=GITHUB%20ACTIONS&logo=github)](https://actions-badge.atrox.dev/xorbitsai/inference/goto?ref=main) [![Discord](https://img.shields.io/badge/join_Discord-5462eb.svg?logo=discord&style=for-the-badge&logoColor=%23f5f5f5)](https://discord.gg/Xw9tszSkr5) -[![Slack](https://img.shields.io/badge/join_Slack-781FF5.svg?logo=slack&style=for-the-badge)](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) [![Twitter](https://img.shields.io/twitter/follow/xorbitsio?logo=x&style=for-the-badge)](https://twitter.com/xorbitsio)

@@ -177,11 +176,11 @@ Once Xinference is running, there are multiple ways you can try it: via the web ## Getting involved -| Platform | Purpose | -|-----------------------------------------------------------------------------------------------|----------------------------------------------------| -| [Github Issues](https://github.com/xorbitsai/inference/issues) | Reporting bugs and filing feature requests. | -| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | Collaborating with other Xorbits users. | -| [Twitter](https://twitter.com/xorbitsio) | Staying up-to-date on new features. | +| Platform | Purpose | +|-------------------------------------------------------------------------------------------------|---------------------------------------------| +| [Github Issues](https://github.com/xorbitsai/inference/issues) | Reporting bugs and filing feature requests. | +| [Discord](https://discord.gg/Xw9tszSkr5) | Collaborating with other Xinference users. | +| [Twitter](https://twitter.com/xorbitsio) | Staying up-to-date on new features. | ## Citation diff --git a/README_ja_JP.md b/README_ja_JP.md index ff3f0e4861..491c4d913c 100644 --- a/README_ja_JP.md +++ b/README_ja_JP.md @@ -6,7 +6,7 @@ [![PyPI Latest Release](https://img.shields.io/pypi/v/xinference.svg?style=for-the-badge)](https://pypi.org/project/xinference/) [![License](https://img.shields.io/pypi/l/xinference.svg?style=for-the-badge)](https://github.com/xorbitsai/inference/blob/main/LICENSE) [![Build Status](https://img.shields.io/github/actions/workflow/status/xorbitsai/inference/python.yaml?branch=main&style=for-the-badge&label=GITHUB%20ACTIONS&logo=github)](https://actions-badge.atrox.dev/xorbitsai/inference/goto?ref=main) -[![Slack](https://img.shields.io/badge/join_Slack-781FF5.svg?logo=slack&style=for-the-badge)](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) +[![Discord](https://img.shields.io/badge/join_Discord-5462eb.svg?logo=discord&style=for-the-badge&logoColor=%23f5f5f5)](https://discord.gg/Xw9tszSkr5) [![Twitter](https://img.shields.io/twitter/follow/xorbitsio?logo=x&style=for-the-badge)](https://twitter.com/xorbitsio)

@@ -24,7 +24,7 @@ Xorbits Inference(Xinference) は、言語、音声認識、マルチモーダ 研究者、開発者、データサイエンティストを問わず、最先端の AI モデルの可能性を最大限に引き出すことができます。

-👉 Slack コミュニティにご参加ください! +👉 Discord コミュニティにご参加ください!
@@ -98,11 +98,11 @@ Xinferenceが実行されると、Web UI、cURL、コマンドライン、また ## 関与する -| プラットフォーム | 目的 | -|---------------------------------------------------------------------------------------------------|--------------------------------------------------| -| [Github イシュー](https://github.com/xorbitsai/inference/issues) | バグ報告と機能リクエストの提出。 | -| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | 他のXorbitsユーザーとの協力。 | -| [Twitter](https://twitter.com/xorbitsio) | 新機能に関する最新情報の入手。 | +| プラットフォーム | 目的 | +|-------------------------------------------------------------------------------------------------|-----------------------| +| [Github イシュー](https://github.com/xorbitsai/inference/issues) | バグ報告と機能リクエストの提出。 | +| [Discord](https://discord.gg/Xw9tszSkr5) | 他のXinferenceユーザーとの協力。 | +| [Twitter](https://twitter.com/xorbitsio) | 新機能に関する最新情報の入手。 | ## 引用 diff --git a/README_zh_CN.md b/README_zh_CN.md index afd3054b8d..b9cb0d471b 100644 --- a/README_zh_CN.md +++ b/README_zh_CN.md @@ -158,13 +158,13 @@ $ xinference-local ## 参与其中 -| 平台 | 目的 | -|------------------------------------------------------------------------------------------------|--------------------------------------------------| -| [Github 问题](https://github.com/xorbitsai/inference/issues) | 报告错误和提交功能请求。 | -| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | 与其他 Xorbits 用户合作。 | -| [Twitter](https://twitter.com/xorbitsio) | 及时了解新功能。 | -| [微信社群](https://xorbits.cn/assets/images/wechat_work_qr.png) | 与其他 Xorbits 用户交流。 | -| [知乎](https://zhihu.com/org/xorbits) | 了解团队最新的进展。 | +| 平台 | 目的 | +|-------------------------------------------------------------------------------------------------|----------------------| +| [Github 问题](https://github.com/xorbitsai/inference/issues) | 报告错误和提交功能请求。 | +| [Discord](https://discord.gg/Xw9tszSkr5) | 与其他 Xinference 用户合作。 | +| [Twitter](https://twitter.com/xorbitsio) | 及时了解新功能。 | +| [微信社群](https://xorbits.cn/assets/images/wechat_work_qr.png) | 与其他 Xinference 用户交流。 | +| [知乎](https://zhihu.com/org/xorbits) | 了解团队最新的进展。 | ## 引用 diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst index ad973c697a..84fc13e153 100644 --- a/doc/source/getting_started/installation.rst +++ b/doc/source/getting_started/installation.rst @@ -44,7 +44,7 @@ Currently, supported models include: - ``codestral-v0.1`` - ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k`` - ``code-llama``, ``code-llama-python``, ``code-llama-instruct`` -- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` +- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-r1-distill-qwen``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` - ``yi-coder``, ``yi-coder-chat`` - ``codeqwen1.5``, ``codeqwen1.5-chat`` - ``qwen2.5``, ``qwen2.5-coder``, ``qwen2.5-instruct``, ``qwen2.5-coder-instruct`` diff --git a/doc/source/models/builtin/llm/deepseek-r1-distill-qwen.rst b/doc/source/models/builtin/llm/deepseek-r1-distill-qwen.rst new file mode 100644 index 0000000000..edaac6ea35 --- /dev/null +++ b/doc/source/models/builtin/llm/deepseek-r1-distill-qwen.rst @@ -0,0 +1,303 @@ +.. 
_models_llm_deepseek-r1-distill-qwen: + +======================================== +deepseek-r1-distill-qwen +======================================== + +- **Context Length:** 131072 +- **Model Name:** deepseek-r1-distill-qwen +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 1_5 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- **Model Hubs**: `Hugging Face <https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B>`__, `ModelScope <https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 1_5 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (awq, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 1_5 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** casperhansen/deepseek-r1-distill-qwen-1.5b-awq +- **Model Hubs**: `Hugging Face <https://huggingface.co/casperhansen/deepseek-r1-distill-qwen-1.5b-awq>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 1_5 --model-format awq --quantization ${quantization} + + +Model Spec 3 (gptq, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 1_5 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** jakiAJK/DeepSeek-R1-Distill-Qwen-1.5B_GPTQ-int4 +- **Model Hubs**: `Hugging Face <https://huggingface.co/jakiAJK/DeepSeek-R1-Distill-Qwen-1.5B_GPTQ-int4>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 1_5 --model-format gptq --quantization ${quantization} + + +Model Spec 4 (ggufv2, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 1_5 +- **Quantizations:** Q2_K, Q2_K_L, Q3_K_M, Q4_K_M, Q5_K_M, Q6_K, Q8_0 +- **Engines**: llama.cpp +- **Model ID:** unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF +- **Model Hubs**: `Hugging Face <https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF>`__, `ModelScope <https://modelscope.cn/models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 1_5 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 5 (mlx, 1_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 1_5 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: MLX +- **Model ID:** mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + +
xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 1_5 --model-format mlx --quantization ${quantization} + + +Model Spec 6 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +- **Model Hubs**: `Hugging Face <https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B>`__, `ModelScope <https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 7 (awq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** jakiAJK/DeepSeek-R1-Distill-Qwen-7B_AWQ +- **Model Hubs**: `Hugging Face <https://huggingface.co/jakiAJK/DeepSeek-R1-Distill-Qwen-7B_AWQ>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 7 --model-format awq --quantization ${quantization} + + +Model Spec 8 (gptq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** jakiAJK/DeepSeek-R1-Distill-Qwen-7B_GPTQ-int4 +- **Model Hubs**: `Hugging Face <https://huggingface.co/jakiAJK/DeepSeek-R1-Distill-Qwen-7B_GPTQ-int4>`__, `ModelScope <https://modelscope.cn/models/tclf90/deepseek-r1-distill-qwen-7b-gptq-int4>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 7 --model-format gptq --quantization ${quantization} + + +Model Spec 9 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q2_K_L, Q3_K_M, Q4_K_M, Q5_K_M, Q6_K, Q8_0, F16 +- **Engines**: llama.cpp +- **Model ID:** unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF +- **Model Hubs**: `Hugging Face <https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF>`__, `ModelScope <https://modelscope.cn/models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 10 (mlx, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 7 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: MLX +- **Model ID:** mlx-community/DeepSeek-R1-Distill-Qwen-7B-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/DeepSeek-R1-Distill-Qwen-7B-{quantization}>`__, `ModelScope <https://modelscope.cn/models/okwinds/DeepSeek-R1-Distill-Qwen-7B-MLX-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 7 --model-format mlx --quantization ${quantization} + + +Model Spec 11 (pytorch, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model 
Format:** pytorch +- **Model Size (in billions):** 14 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-R1-Distill-Qwen-14B +- **Model Hubs**: `Hugging Face <https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B>`__, `ModelScope <https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 14 --model-format pytorch --quantization ${quantization} + + +Model Spec 12 (awq, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 14 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** casperhansen/deepseek-r1-distill-qwen-14b-awq +- **Model Hubs**: `Hugging Face <https://huggingface.co/casperhansen/deepseek-r1-distill-qwen-14b-awq>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 14 --model-format awq --quantization ${quantization} + + +Model Spec 13 (ggufv2, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 14 +- **Quantizations:** Q2_K, Q2_K_L, Q3_K_M, Q4_K_M, Q5_K_M, Q6_K, Q8_0, F16 +- **Engines**: llama.cpp +- **Model ID:** unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF +- **Model Hubs**: `Hugging Face <https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF>`__, `ModelScope <https://modelscope.cn/models/unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 14 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 14 (mlx, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 14 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: MLX +- **Model ID:** mlx-community/DeepSeek-R1-Distill-Qwen-14B-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/DeepSeek-R1-Distill-Qwen-14B-{quantization}>`__, `ModelScope <https://modelscope.cn/models/okwinds/DeepSeek-R1-Distill-Qwen-14B-MLX-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 14 --model-format mlx --quantization ${quantization} + + +Model Spec 15 (pytorch, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 32 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +- **Model Hubs**: `Hugging Face <https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B>`__, `ModelScope <https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 32 --model-format pytorch --quantization ${quantization} + + +Model Spec 16 (awq, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 32 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** 
casperhansen/deepseek-r1-distill-qwen-32b-awq +- **Model Hubs**: `Hugging Face <https://huggingface.co/casperhansen/deepseek-r1-distill-qwen-32b-awq>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 32 --model-format awq --quantization ${quantization} + + +Model Spec 17 (ggufv2, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 32 +- **Quantizations:** Q2_K, Q2_K_L, Q3_K_M, Q4_K_M, Q5_K_M, Q6_K, Q8_0, F16 +- **Engines**: llama.cpp +- **Model ID:** unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF +- **Model Hubs**: `Hugging Face <https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF>`__, `ModelScope <https://modelscope.cn/models/unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 32 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 18 (mlx, 32 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 32 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: MLX +- **Model ID:** mlx-community/DeepSeek-R1-Distill-Qwen-32B-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/DeepSeek-R1-Distill-Qwen-32B-{quantization}>`__, `ModelScope <https://modelscope.cn/models/okwinds/DeepSeek-R1-Distill-Qwen-32B-MLX-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name deepseek-r1-distill-qwen --size-in-billions 32 --model-format mlx --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index 38165aa947..204c3a9424 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -131,6 +131,11 @@ The following is a list of built-in LLM in Xinference: - 16384 - deepseek-coder-instruct is a model initialized from deepseek-coder-base and fine-tuned on 2B tokens of instruction data.
+ * - :ref:`deepseek-r1-distill-qwen <models_llm_deepseek-r1-distill-qwen>` + - chat + - 131072 + - deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen + * - :ref:`deepseek-v2 <models_llm_deepseek-v2>` - generate - 128000 @@ -632,6 +637,8 @@ The following is a list of built-in LLM in Xinference: deepseek-coder-instruct + deepseek-r1-distill-qwen + deepseek-v2 deepseek-v2-chat diff --git a/doc/source/user_guide/backends.rst b/doc/source/user_guide/backends.rst index ba610e5d89..9bdfb21b25 100644 --- a/doc/source/user_guide/backends.rst +++ b/doc/source/user_guide/backends.rst @@ -51,7 +51,7 @@ Currently, supported model includes: - ``codestral-v0.1`` - ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k`` - ``code-llama``, ``code-llama-python``, ``code-llama-instruct`` -- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` +- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-r1-distill-qwen``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5`` - ``yi-coder``, ``yi-coder-chat`` - ``codeqwen1.5``, ``codeqwen1.5-chat`` - ``qwen2.5``, ``qwen2.5-coder``, ``qwen2.5-instruct``, ``qwen2.5-coder-instruct`` diff --git a/xinference/core/chat_interface.py b/xinference/core/chat_interface.py index 08b30ab054..b47461e18c 100644 --- a/xinference/core/chat_interface.py +++ b/xinference/core/chat_interface.py @@ -13,6 +13,7 @@ # limitations under the License. import base64 +import html import logging import os from io import BytesIO @@ -137,7 +138,11 @@ def generate_wrapper( if "content" not in delta: continue else: - response_content += delta["content"] + # some models like deepseek-r1-distill-qwen + # will generate <think> ... </think> ... + # in gradio, no output will be rendered, + # thus escape html tags in advance + response_content += html.escape(delta["content"]) yield response_content yield response_content diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt index c9d1aa8de1..6a611e655b 100644 --- a/xinference/deploy/docker/requirements.txt +++ b/xinference/deploy/docker/requirements.txt @@ -113,5 +113,5 @@ pyzmq>=25.1.2 torchao>=0.7.0 uvloop xgrammar>=0.1.6 -vllm>=0.6.3.post1,<=0.6.4.post1 +vllm==0.6.4.post1 cuda-python diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 5088e0ef46..504a0fc293 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -8716,6 +8716,227 @@ "<|im_end|>" ] }, + { + "version": 1, + "context_length": 131072, + "model_name": "deepseek-r1-distill-qwen", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4" + ], + "model_id": "casperhansen/deepseek-r1-distill-qwen-1.5b-awq" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4" + ], + "model_id": "jakiAJK/DeepSeek-R1-Distill-Qwen-1.5B_GPTQ-int4" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0" + ], + "model_id": 
"unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-1.5B-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": "1_5", + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-{quantization}" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" + }, + { + "model_format": "awq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "jakiAJK/DeepSeek-R1-Distill-Qwen-7B_AWQ" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "jakiAJK/DeepSeek-R1-Distill-Qwen-7B_GPTQ-int4" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-7B-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": 7, + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-7B-{quantization}" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B" + }, + { + "model_format": "awq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4" + ], + "model_id": "casperhansen/deepseek-r1-distill-qwen-14b-awq" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-14B-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": 14, + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-14B-{quantization}" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" + }, + { + "model_format": "awq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4" + ], + "model_id": "casperhansen/deepseek-r1-distill-qwen-32b-awq" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 32, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-32B-{quantization}.gguf" + }, + { + "model_format": "mlx", + "model_size_in_billions": 32, + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-32B-{quantization}" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in 
messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", + "stop_token_ids": [ + 151643 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, { "version": 1, "context_length": 8192, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 762bcdb690..9dd2390f5b 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -6433,6 +6433,194 @@ "<|im_end|>" ] }, + { + "version": 1, + "context_length": 131072, + "model_name": "deepseek-r1-distill-qwen", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "deepseek-r1-distill-qwen is distilled from DeepSeek-R1 based on Qwen", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_5", + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-1.5B-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "tclf90/deepseek-r1-distill-qwen-7b-gptq-int4", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": 
"unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-7B-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "mlx", + "model_size_in_billions": 7, + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit" + ], + "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-7B-MLX-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-14B-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "mlx", + "model_size_in_billions": 14, + "quantizations": [ + "3bit", + "4bit", + "6bit", + "8bit" + ], + "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-14B-MLX-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4" + ], + "model_id": "tclf90/deepseek-r1-distill-qwen-32b-gptq-int4", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 32, + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q4_K_M", + "Q5_K_M", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF", + "model_file_name_template": "DeepSeek-R1-Distill-Qwen-32B-{quantization}.gguf", + "model_hub": "modelscope" + }, + { + "model_format": "mlx", + "model_size_in_billions": 32, + "quantizations": [ + "2bit", + "3bit", + "4bit", + "6bit", + "8bit" + ], + "model_id": "okwinds/DeepSeek-R1-Distill-Qwen-32B-MLX-{quantization}", + "model_hub": "modelscope" + } + ], + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + 
'<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", + "stop_token_ids": [ + 151643 + ], + "stop": [ + "<|end▁of▁sentence|>" + ] + }, { "version": 1, "context_length": 8192, diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index b9a8e5388b..92fc2e6dbc 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -157,7 +157,7 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct") VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview") VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1") - + VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen") if VLLM_INSTALLED and vllm.__version__ >= "0.3.2": VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
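A minimal sketch (not part of the patch) of the escaping behavior added in the ``xinference/core/chat_interface.py`` hunk above. The ``render_stream`` helper and the ``chunks`` list are hypothetical stand-ins for the streamed deltas; without ``html.escape``, gradio's markdown renderer would treat the literal ``<think> ... </think>`` tags emitted by deepseek-r1-distill-qwen as HTML and render nothing::

    import html

    def render_stream(deltas):
        # Accumulate streamed chunks, escaping HTML so literal
        # <think> ... </think> reasoning tags survive gradio's renderer
        # as visible text instead of being parsed as markup.
        response_content = ""
        for delta in deltas:
            if "content" not in delta:
                continue
            response_content += html.escape(delta["content"])
            yield response_content

    # Hypothetical stream: the reasoning tags show up as text.
    chunks = [{"content": "<think>compare 7B vs 32B</think>"},
              {"content": "Use the 32B model."}]
    for text in render_stream(chunks):
        print(text)

The same ``</think>`` marker is what the chat templates above split on (``content.split('</think>')[-1]``), so the reasoning segment of a previous assistant turn is dropped before the conversation is fed back to the model.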