From 513af9d8dc2c25cee6b90ded9b37bcd8102689cf Mon Sep 17 00:00:00 2001
From: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com>
Date: Fri, 27 Dec 2024 18:12:03 +0800
Subject: [PATCH] FEAT: Support QvQ-72B-Preview (#2712)

Co-authored-by: JunHowie
---
 xinference/model/llm/llm_family.json          | 47 ++++++++++++++++++
 .../model/llm/llm_family_modelscope.json      | 49 +++++++++++++++++++
 xinference/model/llm/transformers/core.py     |  1 +
 xinference/model/llm/transformers/qwen2_vl.py |  2 +
 xinference/model/llm/vllm/core.py             |  1 +
 5 files changed, 100 insertions(+)

diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index a28e636fb7..e56bf43d18 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -8942,5 +8942,52 @@
             "<|user|>",
             "<|observation|>"
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QvQ-72B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "QVQ-72B-Preview is an experimental research model developed by the Qwen team, focusing on enhancing visual reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QVQ-72B-Preview"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "3bit",
+                    "4bit",
+                    "6bit",
+                    "8bit",
+                    "bf16"
+                ],
+                "model_id": "mlx-community/QVQ-72B-Preview-{quantization}"
+            }
+        ],
+        "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+        "stop_token_ids": [
+            151645,
+            151643
+        ],
+        "stop": [
+            "<|im_end|>",
+            "<|endoftext|>"
+        ]
     }
 ]
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index 55009787dd..4fce8a531a 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -6673,5 +6673,54 @@
             "<|user|>",
             "<|observation|>"
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QvQ-72B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "QVQ-72B-Preview is an experimental research model developed by the Qwen team, focusing on enhancing visual reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QVQ-72B-Preview",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "3bit",
+                    "4bit",
+                    "6bit",
+                    "8bit",
+                    "bf16"
+                ],
+                "model_id": "mlx-community/QVQ-72B-Preview-{quantization}",
+                "model_hub": "modelscope"
+            }
+        ],
+        "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+        "stop_token_ids": [
+            151645,
+            151643
+        ],
+        "stop": [
+            "<|im_end|>",
+            "<|endoftext|>"
+        ]
     }
 ]
diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py
index 1494ec88ad..25b76db7ff 100644
--- a/xinference/model/llm/transformers/core.py
+++ b/xinference/model/llm/transformers/core.py
@@ -69,6 +69,7 @@
     "deepseek-v2.5",
     "deepseek-v2-chat-0628",
     "glm-edge-v",
+    "QvQ-72B-Preview",
 ]
 
 
diff --git a/xinference/model/llm/transformers/qwen2_vl.py b/xinference/model/llm/transformers/qwen2_vl.py
index 900f261113..45b5917913 100644
--- a/xinference/model/llm/transformers/qwen2_vl.py
+++ b/xinference/model/llm/transformers/qwen2_vl.py
@@ -47,6 +47,8 @@ def match(
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
             return True
+        if "qvq-72b-preview".lower() in llm_family.lower():
+            return True
         return False
 
     def load(self):
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index dc5376bd84..d1a94abadc 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -195,6 +195,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
     VLLM_SUPPORTED_MODELS.append("llama-3.2-vision")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("llama-3.2-vision-instruct")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("QvQ-72B-Preview")
 
 
 class VLLMModel(LLM):
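
Usage note (not part of the patch): a minimal sketch of exercising the newly registered model through the Xinference client, assuming a server is already running at http://localhost:9997, the 72B weights fit on the available GPUs, and the client signatures match the release this patch targets. The image URL and prompt are placeholders, and exact keyword arguments may differ across Xinference versions.

    # Sketch: launch QvQ-72B-Preview via the Transformers engine added in
    # transformers/core.py and send one multimodal chat request.
    # Assumptions: server at http://localhost:9997; pytorch format, no quantization.
    from xinference.client import Client

    client = Client("http://localhost:9997")

    # Names mirror the new llm_family.json entry; the vLLM engine should also
    # accept this model via VLLM_SUPPORTED_VISION_MODEL_LIST.
    model_uid = client.launch_model(
        model_name="QvQ-72B-Preview",
        model_engine="transformers",
        model_format="pytorch",
        model_size_in_billions=72,
        quantization="none",
    )
    model = client.get_model(model_uid)

    # OpenAI-style message with an 'image_url' block, which the chat_template
    # maps to <|vision_start|><|image_pad|><|vision_end|>.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/chart.png"}},
                {"type": "text", "text": "Walk through the reasoning needed to read this chart."},
            ],
        }
    ]
    print(model.chat(messages=messages))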