FEAT: qwen2 audio (#2271)
codingl2k1 authored Sep 14, 2024
1 parent 2666635 commit 961d355
Showing 7 changed files with 391 additions and 1 deletion.
74 changes: 74 additions & 0 deletions xinference/core/tests/test_restful_api.py
@@ -1240,3 +1240,77 @@ def test_launch_model_by_version(setup):
# delete again
url = f"{endpoint}/v1/models/test_qwen15"
requests.delete(url)


@pytest.mark.skip(reason="Cost too many resources.")
def test_restful_api_for_qwen_audio(setup):
model_name = "qwen2-audio-instruct"

endpoint, _ = setup
url = f"{endpoint}/v1/models"

# list
response = requests.get(url)
response_data = response.json()
assert len(response_data["data"]) == 0

# launch
payload = {
"model_uid": "test_audio",
"model_name": model_name,
"model_engine": "transformers",
"model_size_in_billions": 7,
"model_format": "pytorch",
"quantization": "none",
}

response = requests.post(url, json=payload)
response_data = response.json()
model_uid_res = response_data["model_uid"]
assert model_uid_res == "test_audio"

response = requests.get(url)
response_data = response.json()
assert len(response_data["data"]) == 1

url = f"{endpoint}/v1/chat/completions"
payload = {
"model": model_uid_res,
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{
"type": "audio",
"audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
},
{"type": "text", "text": "What's that sound?"},
],
},
{"role": "assistant", "content": "It is the sound of glass shattering."},
{
"role": "user",
"content": [
{"type": "text", "text": "What can you do when you hear that?"},
],
},
{
"role": "assistant",
"content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property.",
},
{
"role": "user",
"content": [
{
"type": "audio",
"audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
},
{"type": "text", "text": "What does the person say?"},
],
},
],
}
response = requests.post(url, json=payload)
completion = response.json()
assert len(completion["choices"][0]["message"]) > 0
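
For readers who want to exercise the new endpoint outside pytest, the same flow can be driven as a standalone script. A minimal sketch, with the endpoint address assumed (the test obtains it from its setup fixture); the launch payload and the audio/text message format are taken verbatim from the test above:

import requests

ENDPOINT = "http://127.0.0.1:9997"  # assumption: a locally running xinference server

# Launch the model with the same payload the test uses.
launch = requests.post(
    f"{ENDPOINT}/v1/models",
    json={
        "model_uid": "test_audio",
        "model_name": "qwen2-audio-instruct",
        "model_engine": "transformers",
        "model_size_in_billions": 7,
        "model_format": "pytorch",
        "quantization": "none",
    },
)
model_uid = launch.json()["model_uid"]

# A single-turn audio question: "content" mixes an "audio" part and a "text" part.
completion = requests.post(
    f"{ENDPOINT}/v1/chat/completions",
    json={
        "model": model_uid,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
                    },
                    {"type": "text", "text": "What's that sound?"},
                ],
            },
        ],
    },
).json()
print(completion["choices"][0]["message"])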
2 changes: 2 additions & 0 deletions xinference/model/llm/__init__.py
@@ -146,6 +146,7 @@ def _install():
from .transformers.internlm2 import Internlm2PytorchChatModel
from .transformers.minicpmv25 import MiniCPMV25Model
from .transformers.minicpmv26 import MiniCPMV26Model
from .transformers.qwen2_audio import Qwen2AudioChatModel
from .transformers.qwen2_vl import Qwen2VLChatModel
from .transformers.qwen_vl import QwenVLChatModel
from .transformers.yi_vl import YiVLChatModel
@@ -177,6 +178,7 @@ def _install():
Internlm2PytorchChatModel,
QwenVLChatModel,
Qwen2VLChatModel,
Qwen2AudioChatModel,
YiVLChatModel,
DeepSeekVLChatModel,
InternVLChatModel,
74 changes: 74 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -6947,6 +6947,80 @@
"</s>"
]
},
{
"version":1,
"context_length":32768,
"model_name":"qwen2-audio-instruct",
"model_lang":[
"en",
"zh"
],
"model_ability":[
"chat",
"audio"
],
"model_description":"Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.",
"model_specs":[
{
"model_format":"pytorch",
"model_size_in_billions":7,
"quantizations":[
"none"
],
"model_id":"Qwen/Qwen2-Audio-7B-Instruct",
"model_revision":"bac62d2c6808845904c709c17a0402d817558c64"
}
],
"prompt_style":{
"style_name":"QWEN",
"system_prompt":"You are a helpful assistant",
"roles":[
"user",
"assistant"
],
"stop": [
"<|im_end|>",
"<|endoftext|>"
]
}
},
{
"version":1,
"context_length":32768,
"model_name":"qwen2-audio",
"model_lang":[
"en",
"zh"
],
"model_ability":[
"chat",
"audio"
],
"model_description":"Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.",
"model_specs":[
{
"model_format":"pytorch",
"model_size_in_billions":7,
"quantizations":[
"none"
],
"model_id":"Qwen/Qwen2-Audio-7B",
"model_revision":"8577bc71d330c8fa32ffe9f8a1374100759f2466"
}
],
"prompt_style":{
"style_name":"QWEN",
"system_prompt":"You are a helpful assistant",
"roles":[
"user",
"assistant"
],
"stop": [
"<|im_end|>",
"<|endoftext|>"
]
}
},
{
"version": 1,
"context_length": 128000,
4 changes: 3 additions & 1 deletion xinference/model/llm/llm_family.py
@@ -132,7 +132,9 @@ class LLMFamilyV1(BaseModel):
context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH
model_name: str
model_lang: List[str]
model_ability: List[Literal["embed", "generate", "chat", "tools", "vision"]]
model_ability: List[
Literal["embed", "generate", "chat", "tools", "vision", "audio"]
]
model_description: Optional[str]
# reason for not required str here: legacy registration
model_family: Optional[str]
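
The practical effect of widening this Literal is that registrations declaring the new "audio" ability now validate, while unknown abilities are still rejected. A self-contained sketch of the check (plain pydantic with a standalone demo model, not the project's actual class):

from typing import List, Literal

from pydantic import BaseModel, ValidationError

class AbilityDemo(BaseModel):
    # Silence pydantic v2's protected-namespace warning for the model_ prefix;
    # ignored under pydantic v1.
    model_config = {"protected_namespaces": ()}

    # Mirrors the widened field on LLMFamilyV1.
    model_ability: List[
        Literal["embed", "generate", "chat", "tools", "vision", "audio"]
    ]

AbilityDemo(model_ability=["chat", "audio"])  # accepted after this change

try:
    AbilityDemo(model_ability=["chat", "video"])  # still rejected
except ValidationError as exc:
    print(exc)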
68 changes: 68 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -4656,6 +4656,74 @@
"</s>"
]
},
{
"version": 1,
"context_length": 32768,
"model_name": "qwen2-audio-instruct",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat",
"audio"
],
"model_description": "Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 7,
"quantizations": [
"none"
],
"model_hub": "modelscope",
"model_id": "qwen/Qwen2-Audio-7B-Instruct",
"model_revision": "master"
}
],
"prompt_style": {
"style_name": "QWEN",
"system_prompt": "You are a helpful assistant",
"roles": [
"user",
"assistant"
]
}
},
{
"version": 1,
"context_length": 32768,
"model_name": "qwen2-audio",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat",
"audio"
],
"model_description": "Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 7,
"quantizations": [
"none"
],
"model_hub": "modelscope",
"model_id": "qwen/Qwen2-Audio-7B",
"model_revision": "master"
}
],
"prompt_style": {
"style_name": "QWEN",
"system_prompt": "You are a helpful assistant",
"roles": [
"user",
"assistant"
]
}
},
{
"version": 1,
"context_length": 128000,
2 changes: 2 additions & 0 deletions xinference/model/llm/transformers/core.py
@@ -65,6 +65,8 @@
"MiniCPM-V-2.6",
"glm-4v",
"qwen2-vl-instruct",
"qwen2-audio",
"qwen2-audio-instruct",
"deepseek-v2",
"deepseek-v2-chat",
"deepseek-v2.5",
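
Both names are added to what appears to be the list of models that the generic transformers implementation must decline, so that the dedicated Qwen2AudioChatModel registered above claims them instead (the list's identifier sits above the visible hunk). The dispatch idea, as a self-contained sketch with hypothetical names:

from typing import List

# Hypothetical stand-in for the (truncated) list this hunk extends.
NON_DEFAULT_MODELS: List[str] = [
    "qwen2-vl-instruct",
    "qwen2-audio",
    "qwen2-audio-instruct",
]

def generic_transformers_matches(model_name: str) -> bool:
    # The generic pytorch path declines models that have a dedicated class,
    # leaving them to implementations such as Qwen2AudioChatModel.
    return model_name not in NON_DEFAULT_MODELS

assert not generic_transformers_matches("qwen2-audio-instruct")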
168 changes: 168 additions & 0 deletions xinference/model/llm/transformers/qwen2_audio.py
