From e9714f1d6685d274f3d6e3cd1c8c5259fe17c9ac Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Sat, 12 Oct 2024 13:27:02 +0800 Subject: [PATCH 1/5] ENH: cosyvoice support pt file --- .../thirdparty/cosyvoice/cli/cosyvoice.py | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py index 49fe15f6c7..dfbd662328 100644 --- a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py +++ b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py @@ -18,10 +18,13 @@ from cosyvoice.cli.frontend import CosyVoiceFrontEnd from cosyvoice.cli.model import CosyVoiceModel from cosyvoice.utils.file_utils import logging +import torch class CosyVoice: def __init__(self, model_dir, load_jit=True): + self.default_voices = ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'] + instruct = True if '-Instruct' in model_dir else False self.model_dir = model_dir if not os.path.exists(model_dir): @@ -50,7 +53,24 @@ def list_avaliable_spks(self): def inference_sft(self, tts_text, spk_id, stream=False): for i in self.frontend.text_normalize(tts_text, split=True): - model_input = self.frontend.frontend_sft(i, spk_id) + if spk_id not in self.default_voices and os.environ["cosyvoice_pt_path"] is not None: + model_input = self.frontend.frontend_sft(i, "中文女") + newspk = torch.load(f'{os.environ["cosyvoice_pt_path"]}/{spk_id}.pt') + model_input["flow_embedding"] = newspk["flow_embedding"] + model_input["llm_embedding"] = newspk["llm_embedding"] + + model_input["llm_prompt_speech_token"] = newspk["llm_prompt_speech_token"] + model_input["llm_prompt_speech_token_len"] = newspk["llm_prompt_speech_token_len"] + + model_input["flow_prompt_speech_token"] = newspk["flow_prompt_speech_token"] + model_input["flow_prompt_speech_token_len"] = newspk["flow_prompt_speech_token_len"] + + model_input["prompt_speech_feat_len"] = newspk["prompt_speech_feat_len"] + model_input["prompt_speech_feat"] = 
newspk["prompt_speech_feat"] + model_input["prompt_text"] = newspk["prompt_text"] + model_input["prompt_text_len"] = newspk["prompt_text_len"] + else: + model_input = self.frontend.frontend_sft(i, spk_id) start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.inference(**model_input, stream=stream): @@ -89,7 +109,20 @@ def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False): raise ValueError('{} do not support instruct inference'.format(self.model_dir)) instruct_text = self.frontend.text_normalize(instruct_text, split=False) for i in self.frontend.text_normalize(tts_text, split=True): - model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) + if spk_id not in self.default_voices and os.environ["cosyvoice_pt_path"] is not None: + model_input = self.frontend.frontend_instruct(i, "中文女", instruct_text) + newspk = torch.load(f'{os.environ["cosyvoice_pt_path"]}/{spk_id}.pt') + + model_input["flow_embedding"] = newspk["flow_embedding"] + model_input["llm_embedding"] = newspk["llm_embedding"] + + model_input["llm_prompt_speech_token"] = newspk["llm_prompt_speech_token"] + model_input["llm_prompt_speech_token_len"] = newspk["llm_prompt_speech_token_len"] + + model_input["flow_prompt_speech_token"] = newspk["flow_prompt_speech_token"] + model_input["flow_prompt_speech_token_len"] = newspk["flow_prompt_speech_token_len"] + else: + model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.inference(**model_input, stream=stream): From b36bb0f21eb9fae06a4d914e9f7b5a9a61fd5af9 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Sat, 12 Oct 2024 13:42:48 +0800 Subject: [PATCH 2/5] change env key to capital form --- xinference/thirdparty/cosyvoice/cli/cosyvoice.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py 
b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py index dfbd662328..e4afbdd8ef 100644 --- a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py +++ b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py @@ -53,9 +53,9 @@ def list_avaliable_spks(self): def inference_sft(self, tts_text, spk_id, stream=False): for i in self.frontend.text_normalize(tts_text, split=True): - if spk_id not in self.default_voices and os.environ["cosyvoice_pt_path"] is not None: + if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: model_input = self.frontend.frontend_sft(i, "中文女") - newspk = torch.load(f'{os.environ["cosyvoice_pt_path"]}/{spk_id}.pt') + newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') model_input["flow_embedding"] = newspk["flow_embedding"] model_input["llm_embedding"] = newspk["llm_embedding"] @@ -109,9 +109,9 @@ def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False): raise ValueError('{} do not support instruct inference'.format(self.model_dir)) instruct_text = self.frontend.text_normalize(instruct_text, split=False) for i in self.frontend.text_normalize(tts_text, split=True): - if spk_id not in self.default_voices and os.environ["cosyvoice_pt_path"] is not None: + if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: model_input = self.frontend.frontend_instruct(i, "中文女", instruct_text) - newspk = torch.load(f'{os.environ["cosyvoice_pt_path"]}/{spk_id}.pt') + newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') model_input["flow_embedding"] = newspk["flow_embedding"] model_input["llm_embedding"] = newspk["llm_embedding"] From fcb5b6ad11535c6763809332df50377e0ebc05a2 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Sat, 12 Oct 2024 15:58:25 +0800 Subject: [PATCH 3/5] optimize for loading load --- xinference/thirdparty/cosyvoice/cli/cosyvoice.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git 
a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py index e4afbdd8ef..873e51feb0 100644 --- a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py +++ b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py @@ -52,10 +52,12 @@ def list_avaliable_spks(self): return spks def inference_sft(self, tts_text, spk_id, stream=False): + if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: + newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') for i in self.frontend.text_normalize(tts_text, split=True): - if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: + if newspk is not None: model_input = self.frontend.frontend_sft(i, "中文女") - newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') + model_input["flow_embedding"] = newspk["flow_embedding"] model_input["llm_embedding"] = newspk["llm_embedding"] @@ -107,11 +109,12 @@ def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False): def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False): if self.frontend.instruct is False: raise ValueError('{} do not support instruct inference'.format(self.model_dir)) + if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: + newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') instruct_text = self.frontend.text_normalize(instruct_text, split=False) for i in self.frontend.text_normalize(tts_text, split=True): - if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: + if newspk is not None: model_input = self.frontend.frontend_instruct(i, "中文女", instruct_text) - newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') model_input["flow_embedding"] = newspk["flow_embedding"] model_input["llm_embedding"] = newspk["llm_embedding"] From 787e2397826edd751f830132552b18c60d4bb4e8 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Sat, 12 Oct 
2024 16:21:33 +0800 Subject: [PATCH 4/5] pt cache optimize --- xinference/thirdparty/cosyvoice/cli/cosyvoice.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py index 873e51feb0..bcd42d53b4 100644 --- a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py +++ b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py @@ -24,6 +24,7 @@ class CosyVoice: def __init__(self, model_dir, load_jit=True): self.default_voices = ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'] + self.pt_cache = {} instruct = True if '-Instruct' in model_dir else False self.model_dir = model_dir @@ -53,7 +54,9 @@ def list_avaliable_spks(self): def inference_sft(self, tts_text, spk_id, stream=False): if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: - newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') + if spk_id not in self.pt_cache: + self.pt_cache[spk_id] = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') + newspk = self.pt_cache[spk_id] for i in self.frontend.text_normalize(tts_text, split=True): if newspk is not None: model_input = self.frontend.frontend_sft(i, "中文女") @@ -109,8 +112,9 @@ def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False): def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False): if self.frontend.instruct is False: raise ValueError('{} do not support instruct inference'.format(self.model_dir)) - if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: - newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') + if spk_id not in self.pt_cache: + self.pt_cache[spk_id] = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') + newspk = self.pt_cache[spk_id] instruct_text = self.frontend.text_normalize(instruct_text, split=False) for i in self.frontend.text_normalize(tts_text, split=True): if newspk is not 
None: From dc5cf065c23f868a157b3a55c14cf922142ca402 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Mon, 14 Oct 2024 15:51:11 +0800 Subject: [PATCH 5/5] add usage doc --- doc/source/models/model_abilities/audio.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/models/model_abilities/audio.rst b/doc/source/models/model_abilities/audio.rst index d6731913d8..368b45e2a1 100644 --- a/doc/source/models/model_abilities/audio.rst +++ b/doc/source/models/model_abilities/audio.rst @@ -19,7 +19,7 @@ The Audio API provides three methods for interacting with audio: * The speech endpoint generates audio from the input text. -.. list-table:: +.. list-table:: :widths: 25 50 :header-rows: 1 @@ -91,7 +91,7 @@ We can try Transcription API out either via cURL, OpenAI Client, or Xinference's import openai client = openai.Client( - api_key="cannot be empty", + api_key="cannot be empty", base_url="http://:/v1" ) with open("speech.mp3", "rb") as audio_file: @@ -270,6 +270,7 @@ CosyVoice Usage ~~~~~~~~~~~~~~~ Basic usage, launch model ``CosyVoice-300M-SFT``. +Note: if you have persisted ``.pt`` files for cloned voices, set the environment variable ``COSYVOICE_PT_PATH`` to the path of the folder containing those ``.pt`` files. .. tabs:: @@ -282,7 +283,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``. -d '{ "model": "", "input": "", - # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'] + # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', other voices you put in ``COSYVOICE_PT_PATH``] "voice": "中文女" }' @@ -297,7 +298,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``. response = client.audio.speech.create( model=, input=, - # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'] + # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', other voices you put in ``COSYVOICE_PT_PATH``] voice="中文女", ) response.stream_to_file('1.mp3') @@ -311,7 +312,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``. 
model = client.get_model("") speech_bytes = model.speech( input=, - # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'] + # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', other voices you put in ``COSYVOICE_PT_PATH``] voice="中文女" ) with open('1.mp3', 'wb') as f: