From e9714f1d6685d274f3d6e3cd1c8c5259fe17c9ac Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Sat, 12 Oct 2024 13:27:02 +0800 Subject: [PATCH 1/5] ENH: cosyvoice support pt file --- .../thirdparty/cosyvoice/cli/cosyvoice.py | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py index 49fe15f6c7..dfbd662328 100644 --- a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py +++ b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py @@ -18,10 +18,13 @@ from cosyvoice.cli.frontend import CosyVoiceFrontEnd from cosyvoice.cli.model import CosyVoiceModel from cosyvoice.utils.file_utils import logging +import torch class CosyVoice: def __init__(self, model_dir, load_jit=True): + self.default_voices = ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'] + instruct = True if '-Instruct' in model_dir else False self.model_dir = model_dir if not os.path.exists(model_dir): @@ -50,7 +53,24 @@ def list_avaliable_spks(self): def inference_sft(self, tts_text, spk_id, stream=False): for i in self.frontend.text_normalize(tts_text, split=True): - model_input = self.frontend.frontend_sft(i, spk_id) + if spk_id not in self.default_voices and os.environ["cosyvoice_pt_path"] is not None: + model_input = self.frontend.frontend_sft(i, "中文女") + newspk = torch.load(f'{os.environ["cosyvoice_pt_path"]}/{spk_id}.pt') + model_input["flow_embedding"] = newspk["flow_embedding"] + model_input["llm_embedding"] = newspk["llm_embedding"] + + model_input["llm_prompt_speech_token"] = newspk["llm_prompt_speech_token"] + model_input["llm_prompt_speech_token_len"] = newspk["llm_prompt_speech_token_len"] + + model_input["flow_prompt_speech_token"] = newspk["flow_prompt_speech_token"] + model_input["flow_prompt_speech_token_len"] = newspk["flow_prompt_speech_token_len"] + + model_input["prompt_speech_feat_len"] = newspk["prompt_speech_feat_len"] + model_input["prompt_speech_feat"] = 
newspk["prompt_speech_feat"] + model_input["prompt_text"] = newspk["prompt_text"] + model_input["prompt_text_len"] = newspk["prompt_text_len"] + else: + model_input = self.frontend.frontend_sft(i, spk_id) start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.inference(**model_input, stream=stream): @@ -89,7 +109,20 @@ def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False): raise ValueError('{} do not support instruct inference'.format(self.model_dir)) instruct_text = self.frontend.text_normalize(instruct_text, split=False) for i in self.frontend.text_normalize(tts_text, split=True): - model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) + if spk_id not in self.default_voices and os.environ["cosyvoice_pt_path"] is not None: + model_input = self.frontend.frontend_instruct(i, "中文女", instruct_text) + newspk = torch.load(f'{os.environ["cosyvoice_pt_path"]}/{spk_id}.pt') + + model_input["flow_embedding"] = newspk["flow_embedding"] + model_input["llm_embedding"] = newspk["llm_embedding"] + + model_input["llm_prompt_speech_token"] = newspk["llm_prompt_speech_token"] + model_input["llm_prompt_speech_token_len"] = newspk["llm_prompt_speech_token_len"] + + model_input["flow_prompt_speech_token"] = newspk["flow_prompt_speech_token"] + model_input["flow_prompt_speech_token_len"] = newspk["flow_prompt_speech_token_len"] + else: + model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.inference(**model_input, stream=stream): From b36bb0f21eb9fae06a4d914e9f7b5a9a61fd5af9 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Sat, 12 Oct 2024 13:42:48 +0800 Subject: [PATCH 2/5] change env key to capital form --- xinference/thirdparty/cosyvoice/cli/cosyvoice.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py 
b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py index dfbd662328..e4afbdd8ef 100644 --- a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py +++ b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py @@ -53,9 +53,9 @@ def list_avaliable_spks(self): def inference_sft(self, tts_text, spk_id, stream=False): for i in self.frontend.text_normalize(tts_text, split=True): - if spk_id not in self.default_voices and os.environ["cosyvoice_pt_path"] is not None: + if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: model_input = self.frontend.frontend_sft(i, "中文女") - newspk = torch.load(f'{os.environ["cosyvoice_pt_path"]}/{spk_id}.pt') + newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') model_input["flow_embedding"] = newspk["flow_embedding"] model_input["llm_embedding"] = newspk["llm_embedding"] @@ -109,9 +109,9 @@ def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False): raise ValueError('{} do not support instruct inference'.format(self.model_dir)) instruct_text = self.frontend.text_normalize(instruct_text, split=False) for i in self.frontend.text_normalize(tts_text, split=True): - if spk_id not in self.default_voices and os.environ["cosyvoice_pt_path"] is not None: + if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: model_input = self.frontend.frontend_instruct(i, "中文女", instruct_text) - newspk = torch.load(f'{os.environ["cosyvoice_pt_path"]}/{spk_id}.pt') + newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') model_input["flow_embedding"] = newspk["flow_embedding"] model_input["llm_embedding"] = newspk["llm_embedding"] From fcb5b6ad11535c6763809332df50377e0ebc05a2 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Sat, 12 Oct 2024 15:58:25 +0800 Subject: [PATCH 3/5] optimize for loading load --- xinference/thirdparty/cosyvoice/cli/cosyvoice.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git 
a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py index e4afbdd8ef..873e51feb0 100644 --- a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py +++ b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py @@ -52,10 +52,12 @@ def list_avaliable_spks(self): return spks def inference_sft(self, tts_text, spk_id, stream=False): + if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: + newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') for i in self.frontend.text_normalize(tts_text, split=True): - if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: + if newspk is not None: model_input = self.frontend.frontend_sft(i, "中文女") - newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') + model_input["flow_embedding"] = newspk["flow_embedding"] model_input["llm_embedding"] = newspk["llm_embedding"] @@ -107,11 +109,12 @@ def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False): def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False): if self.frontend.instruct is False: raise ValueError('{} do not support instruct inference'.format(self.model_dir)) + if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: + newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') instruct_text = self.frontend.text_normalize(instruct_text, split=False) for i in self.frontend.text_normalize(tts_text, split=True): - if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: + if newspk is not None: model_input = self.frontend.frontend_instruct(i, "中文女", instruct_text) - newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') model_input["flow_embedding"] = newspk["flow_embedding"] model_input["llm_embedding"] = newspk["llm_embedding"] From 787e2397826edd751f830132552b18c60d4bb4e8 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Sat, 12 Oct 
2024 16:21:33 +0800 Subject: [PATCH 4/5] pt cache optimize --- xinference/thirdparty/cosyvoice/cli/cosyvoice.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py index 873e51feb0..bcd42d53b4 100644 --- a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py +++ b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py @@ -24,6 +24,7 @@ class CosyVoice: def __init__(self, model_dir, load_jit=True): self.default_voices = ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'] + self.pt_cache = {} instruct = True if '-Instruct' in model_dir else False self.model_dir = model_dir @@ -53,7 +54,9 @@ def list_avaliable_spks(self): def inference_sft(self, tts_text, spk_id, stream=False): if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: - newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') + if spk_id not in self.pt_cache: + self.pt_cache[spk_id] = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') + newspk = self.pt_cache[spk_id] for i in self.frontend.text_normalize(tts_text, split=True): if newspk is not None: model_input = self.frontend.frontend_sft(i, "中文女") @@ -109,8 +112,9 @@ def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False): def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False): if self.frontend.instruct is False: raise ValueError('{} do not support instruct inference'.format(self.model_dir)) - if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None: - newspk = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') + if spk_id not in self.pt_cache: + self.pt_cache[spk_id] = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt') + newspk = self.pt_cache[spk_id] instruct_text = self.frontend.text_normalize(instruct_text, split=False) for i in self.frontend.text_normalize(tts_text, split=True): if newspk is not 
None: From dc5cf065c23f868a157b3a55c14cf922142ca402 Mon Sep 17 00:00:00 2001 From: Minamiyama Date: Mon, 14 Oct 2024 15:51:11 +0800 Subject: [PATCH 5/5] add usage doc --- doc/source/models/model_abilities/audio.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/models/model_abilities/audio.rst b/doc/source/models/model_abilities/audio.rst index d6731913d8..368b45e2a1 100644 --- a/doc/source/models/model_abilities/audio.rst +++ b/doc/source/models/model_abilities/audio.rst @@ -19,7 +19,7 @@ The Audio API provides three methods for interacting with audio: * The speech endpoint generates audio from the input text. -.. list-table:: +.. list-table:: :widths: 25 50 :header-rows: 1 @@ -91,7 +91,7 @@ We can try Transcription API out either via cURL, OpenAI Client, or Xinference's import openai client = openai.Client( - api_key="cannot be empty", + api_key="cannot be empty", base_url="http://:/v1" ) with open("speech.mp3", "rb") as audio_file: @@ -270,6 +270,7 @@ CosyVoice Usage ~~~~~~~~~~~~~~~ Basic usage, launch model ``CosyVoice-300M-SFT``. +Note: if you have persisted ``.pt`` files for cloned voices, set the environment variable ``COSYVOICE_PT_PATH`` to the path of the folder containing those ``.pt`` files. .. tabs:: @@ -282,7 +283,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``. -d '{ "model": "", "input": "", - # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'] + # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', other voices you put in ``COSYVOICE_PT_PATH``] "voice": "中文女" }' @@ -297,7 +298,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``. response = client.audio.speech.create( model=, input=, - # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'] + # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', other voices you put in ``COSYVOICE_PT_PATH``] voice="中文女", ) response.stream_to_file('1.mp3') @@ -311,7 +312,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``. 
model = client.get_model("") speech_bytes = model.speech( input=, - # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'] + # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', other voices you put in ``COSYVOICE_PT_PATH``] voice="中文女" ) with open('1.mp3', 'wb') as f: