From d3428697115cc4666b38b32925ba28bdc1a21957 Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 27 Dec 2024 18:14:37 +0800 Subject: [PATCH] FEAT: support SD3.5 series model (#2706) --- doc/source/gen_docs.py | 1 + .../models/model_abilities/image.po | 247 +++++++++++++++--- .../models/builtin/audio/cosyvoice2-0.5b.rst | 19 ++ .../models/builtin/audio/f5-tts-mlx.rst | 19 ++ .../models/builtin/audio/fishspeech-1.5.rst | 19 ++ doc/source/models/builtin/audio/index.rst | 6 +- .../models/builtin/image/flux.1-dev.rst | 10 +- .../models/builtin/image/flux.1-schnell.rst | 10 +- .../models/builtin/image/got-ocr2_0.rst | 3 +- doc/source/models/builtin/image/index.rst | 6 + doc/source/models/builtin/image/kolors.rst | 3 +- doc/source/models/builtin/image/sd-turbo.rst | 3 +- .../models/builtin/image/sd3-medium.rst | 3 +- .../builtin/image/sd3.5-large-turbo.rst | 27 ++ .../models/builtin/image/sd3.5-large.rst | 27 ++ .../models/builtin/image/sd3.5-medium.rst | 27 ++ .../models/builtin/image/sdxl-turbo.rst | 3 +- .../image/stable-diffusion-2-inpainting.rst | 3 +- .../image/stable-diffusion-inpainting.rst | 3 +- .../builtin/image/stable-diffusion-v1.5.rst | 3 +- .../image/stable-diffusion-xl-base-1.0.rst | 3 +- .../image/stable-diffusion-xl-inpainting.rst | 3 +- doc/source/models/model_abilities/image.rst | 108 ++++++-- doc/templates/image.rst.jinja | 12 +- setup.cfg | 11 +- xinference/core/utils.py | 16 +- xinference/deploy/docker/requirements.txt | 4 +- xinference/deploy/docker/requirements_cpu.txt | 4 +- xinference/model/image/core.py | 70 ++++- xinference/model/image/model_spec.json | 131 +++++++++- .../model/image/model_spec_modelscope.json | 134 +++++++++- .../model/image/stable_diffusion/core.py | 58 +++- 32 files changed, 884 insertions(+), 112 deletions(-) create mode 100644 doc/source/models/builtin/audio/cosyvoice2-0.5b.rst create mode 100644 doc/source/models/builtin/audio/f5-tts-mlx.rst create mode 100644 doc/source/models/builtin/audio/fishspeech-1.5.rst create mode 100644 doc/source/models/builtin/image/sd3.5-large-turbo.rst create mode 100644 doc/source/models/builtin/image/sd3.5-large.rst create mode 100644 doc/source/models/builtin/image/sd3.5-medium.rst diff --git a/doc/source/gen_docs.py b/doc/source/gen_docs.py index 68e381edf6..380d57bb8b 100644 --- a/doc/source/gen_docs.py +++ b/doc/source/gen_docs.py @@ -203,6 +203,7 @@ def get_unique_id(spec): available_controlnet = None model["available_controlnet"] = available_controlnet model["model_ability"] = ', '.join(model.get("model_ability")) + model["gguf_quantizations"] = ", ".join(model.get("gguf_quantizations", [])) rendered = env.get_template('image.rst.jinja').render(model) output_file_path = os.path.join(output_dir, f"{model['model_name'].lower()}.rst") with open(output_file_path, 'w') as output_file: diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po b/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po index e73ba213b0..9c61d12faa 100644 --- a/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po +++ b/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: Xinference \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2024-10-30 07:49+0000\n" +"POT-Creation-Date: 2024-12-26 18:49+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -17,7 +17,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 
8bit\n" -"Generated-By: Babel 2.16.0\n" +"Generated-By: Babel 2.14.0\n" #: ../../source/models/model_abilities/image.rst:5 msgid "Images" @@ -98,26 +98,48 @@ msgid "stable-diffusion-xl-base-1.0" msgstr "" #: ../../source/models/model_abilities/image.rst:43 +#: ../../source/models/model_abilities/image.rst:149 msgid "sd3-medium" msgstr "" #: ../../source/models/model_abilities/image.rst:44 -msgid "FLUX.1-schnell" +#: ../../source/models/model_abilities/image.rst:151 +#: ../../source/models/model_abilities/image.rst:180 +msgid "sd3.5-medium" msgstr "" #: ../../source/models/model_abilities/image.rst:45 +#: ../../source/models/model_abilities/image.rst:153 +#: ../../source/models/model_abilities/image.rst:182 +msgid "sd3.5-large" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:46 +#: ../../source/models/model_abilities/image.rst:155 +msgid "sd3.5-large-turbo" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:47 +#: ../../source/models/model_abilities/image.rst:147 +#: ../../source/models/model_abilities/image.rst:178 +msgid "FLUX.1-schnell" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:48 +#: ../../source/models/model_abilities/image.rst:145 +#: ../../source/models/model_abilities/image.rst:176 msgid "FLUX.1-dev" msgstr "" -#: ../../source/models/model_abilities/image.rst:49 +#: ../../source/models/model_abilities/image.rst:52 msgid "Quickstart" msgstr "快速入门" -#: ../../source/models/model_abilities/image.rst:52 +#: ../../source/models/model_abilities/image.rst:55 msgid "Text-to-image" msgstr "文生图" -#: ../../source/models/model_abilities/image.rst:54 +#: ../../source/models/model_abilities/image.rst:57 msgid "" "The Text-to-image API mimics OpenAI's `create images API " "`_. We can " @@ -127,15 +149,26 @@ msgstr "" "可以通过 cURL、OpenAI Client 或 Xinference 的方式尝试使用 Text-to-image " "API。" -#: ../../source/models/model_abilities/image.rst:109 -msgid "Tips for Large Image Models including SD3-Medium, FLUX.1" -msgstr "大型图像模型部署(sd3-medium、FLUX.1 系列)贴士" +#: ../../source/models/model_abilities/image.rst:112 +msgid "Quantize Large Image Models e.g. SD3-Medium, FLUX.1" +msgstr "量化大型图像模型(sd3-medium、FLUX.1 系列等)" -#: ../../source/models/model_abilities/image.rst:111 +#: ../../source/models/model_abilities/image.rst:116 +msgid "" +"From v0.16.1, Xinference by default enabled quantization for large image " +"models like Flux.1 and SD3.5 series. So if your Xinference version is " +"newer than v0.16.1, You barely need to do anything to run those large " +"image models on GPUs with small memory." 
+msgstr "" +"从 v0.16.1 开始,Xinference 默认对大图像模型如 Flux.1 和 SD3.5 系列开启" +"量化。如果你使用新于 v0.16.1 的 Xinference 版本,你不需要做什么事情来在小" +" GPU 显存的机器上来运行这些大型图像模型。" + +#: ../../source/models/model_abilities/image.rst:121 msgid "Useful extra parameters can be passed to launch including:" msgstr "有用的传递给加载模型的额外参数包括:" -#: ../../source/models/model_abilities/image.rst:113 +#: ../../source/models/model_abilities/image.rst:123 msgid "" "``--cpu_offload True``: specifying ``True`` will offload the components " "of the model to CPU during inference in order to save memory, while " @@ -147,7 +180,7 @@ msgstr "" "CPU 上以节省内存,这会导致推理延迟略有增加。模型卸载仅会在需要执行时将" "模型组件移动到 GPU 上,同时保持其余组件在 CPU 上" -#: ../../source/models/model_abilities/image.rst:117 +#: ../../source/models/model_abilities/image.rst:127 msgid "" "``--quantize_text_encoder ``: We leveraged the " "``bitsandbytes`` library to load and quantize the T5-XXL text encoder to " @@ -158,7 +191,7 @@ msgstr "" "`` 库加载并量化 T5-XXL 文本编码器至8位精度。这使得你能够在仅轻微影响性能" "的情况下继续使用全部文本编码器。" -#: ../../source/models/model_abilities/image.rst:120 +#: ../../source/models/model_abilities/image.rst:130 msgid "" "``--text_encoder_3 None``, for sd3-medium, removing the memory-intensive " "4.7B parameter T5-XXL text encoder during inference can significantly " @@ -167,53 +200,195 @@ msgstr "" "``--text_encoder_3 None``,对于 sd3-medium,移除在推理过程中内存密集型的" "47亿参数T5-XXL文本编码器可以显著降低内存需求,而仅造成性能上的轻微损失。" -#: ../../source/models/model_abilities/image.rst:124 +#: ../../source/models/model_abilities/image.rst:133 +msgid "``--transformer_nf4 True``: use nf4 for transformer quantization." +msgstr "``--transformer_nf4 True`` :使用 nf4 量化 transformer。" + +#: ../../source/models/model_abilities/image.rst:134 msgid "" -"If you are trying to run large image models liek sd3-medium or FLUX.1 " -"series on GPU card that has less memory than 24GB, you may encounter OOM " -"when launching or inference. Try below solutions." +"``--quantize``: Only work for MLX on Mac, Flux.1-dev and Flux.1-schnell " +"will switch to MLX engine on Mac, and ``quantize`` can be used to " +"quantize the model." msgstr "" -"如果你试图在显存小于24GB的GPU上运行像sd3-medium或FLUX.1系列这样的大型图像" -"模型,你在启动或推理过程中可能会遇到显存溢出(OOM)的问题。尝试以下" -"解决方案。" +"``--quantize`` :只对 Mac 上的 MLX 引擎生效,Flux.1-dev 和 Flux.1-schnell" +"会在 Mac 上使用 MLX 引擎计算,``quantize`` 可以用来量化模型。" -#: ../../source/models/model_abilities/image.rst:128 -msgid "For FLUX.1 series, try to apply quantization." -msgstr "对于 FLUX.1 系列,尝试应用量化。" +#: ../../source/models/model_abilities/image.rst:137 +msgid "" +"For WebUI, Just add additional parameters, e.g. add key ``cpu_offload`` " +"and value ``True`` to enable cpu offloading." +msgstr "" +"对于 WebUI,只需要添加额外参数,比如,添加 key ``cpu_offload`` 以及值 ``" +"True`` 来开启 CPU 卸载。" -#: ../../source/models/model_abilities/image.rst:134 -msgid "For sd3-medium, apply quantization to ``text_encoder_3``." -msgstr "对于 sd3-medium 模型,对 ``text_encoder_3`` 应用量化。" +#: ../../source/models/model_abilities/image.rst:140 +msgid "Below list default options that used from v0.16.1." +msgstr "如下列出了从 v0.16.1 开始默认使用的参数。" + +#: ../../source/models/model_abilities/image.rst:143 +#: ../../source/models/model_abilities/image.rst:174 +msgid "Model" +msgstr "模型" + +#: ../../source/models/model_abilities/image.rst:143 +msgid "quantize_text_encoder" +msgstr "" -#: ../../source/models/model_abilities/image.rst:141 -msgid "Or removing memory-intensive T5-XXL text encoder for sd3-medium." 
-msgstr "或者,移除 sd3-medium 模型中内存密集型的 T5-XXL 文本编码器。" +#: ../../source/models/model_abilities/image.rst:143 +msgid "quantize" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:143 +msgid "transformer_nf4" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:145 +#: ../../source/models/model_abilities/image.rst:147 +msgid "text_encoder_2" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:145 +#: ../../source/models/model_abilities/image.rst:147 +#: ../../source/models/model_abilities/image.rst:153 +#: ../../source/models/model_abilities/image.rst:155 +msgid "True" +msgstr "" -#: ../../source/models/model_abilities/image.rst:148 +#: ../../source/models/model_abilities/image.rst:145 +#: ../../source/models/model_abilities/image.rst:147 +#: ../../source/models/model_abilities/image.rst:149 +#: ../../source/models/model_abilities/image.rst:151 +msgid "False" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:149 +#: ../../source/models/model_abilities/image.rst:151 +#: ../../source/models/model_abilities/image.rst:153 +#: ../../source/models/model_abilities/image.rst:155 +msgid "text_encoder_3" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:149 +#: ../../source/models/model_abilities/image.rst:151 +#: ../../source/models/model_abilities/image.rst:153 +#: ../../source/models/model_abilities/image.rst:155 +msgid "N/A" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:160 +msgid "" +"If you want to disable some quantization, just set the corresponding " +"option to False. e.g. for Web UI, set key ``quantize_text_encoder`` and " +"value ``False`` and for command line, specify ``--quantize_text_encoder " +"False`` to disable quantization for text encoder." +msgstr "" +"如果你想关闭某些量化,只需要设置相应的选项为 False。比如,对于 Web UI," +"设置 key ``quantize_text_encoder`` 和值 ``False``,或对于命令行,指定 ``" +"--quantize_text_encoder False`` 来关闭 text encoder 的量化。" + +#: ../../source/models/model_abilities/image.rst:166 +msgid "GGUF file format" +msgstr "GGUF 文件格式" + +#: ../../source/models/model_abilities/image.rst:168 +msgid "" +"GGUF file format for transformer provides various quantization options. " +"To use gguf file, you can specify additional option ``gguf_quantization``" +" for web UI, or ``--gguf_quantization`` for command line for those image " +"models which support internally by Xinference. Below is the mode list." +msgstr "" +"GGUF 文件格式为 transformer 模块提供了丰富的量化选项。要使用 GGUF 文件," +"你可以在 Web 界面上指定额外选项 ``gguf_quantization`` ,或者在命令行指定 " +"``--gguf_quantization`` ,以为 Xinference 内建支持 GGUF 量化的模型开启。" +"如下是内置支持的模型。" + +#: ../../source/models/model_abilities/image.rst:174 +msgid "supported gguf quantization" +msgstr "支持 GGUF 量化格式" + +#: ../../source/models/model_abilities/image.rst:176 +#: ../../source/models/model_abilities/image.rst:178 +msgid "F16, Q2_K, Q3_K_S, Q4_0, Q4_1, Q4_K_S, Q5_0, Q5_1, Q5_K_S, Q6_K, Q8_0" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:187 +msgid "" +"We stronly recommend to enable additional option ``cpu_offload`` with " +"value ``True`` for WebUI, or specify ``--cpu_offload True`` for command " +"line." +msgstr "" +"我们强烈推荐在 WebUI 上开启额外选项 ``cpu_offload`` 并指定为 ``True``,或" +"对命令行,指定 ``--cpu_offload True``。" + +#: ../../source/models/model_abilities/image.rst:190 +msgid "Example:" +msgstr "例如:" + +#: ../../source/models/model_abilities/image.rst:196 +msgid "" +"With ``Q2_K`` quantization, you only need around 5 GiB GPU memory to run " +"Flux.1-dev." 
+msgstr "" +"使用 ``Q2_K`` 量化,你只需要大约 5GB 的显存来运行 Flux.1-dev。" + +#: ../../source/models/model_abilities/image.rst:198 +msgid "" +"For those models gguf options are not supported internally, or you want " +"to download gguf files on you own, you can specify additional option " +"``gguf_model_path`` for web UI or spcecify ``--gguf_model_path " +"/path/to/model_quant.gguf`` for command line." +msgstr "" +"对于非内建支持 GGUF 量化的模型,或者你希望自己下载 GGUF 文件,你可以在 " +"Web UI 指定额外选项 ``gguf_model_path`` 或者用命令行指定 ``--gguf_model_" +"path /path/to/model_quant.gguf`` 。" + +#: ../../source/models/model_abilities/image.rst:204 msgid "Image-to-image" msgstr "图生图" -#: ../../source/models/model_abilities/image.rst:150 +#: ../../source/models/model_abilities/image.rst:206 msgid "You can find more examples of Images API in the tutorial notebook:" msgstr "你可以在教程笔记本中找到更多 Images API 的示例。" -#: ../../source/models/model_abilities/image.rst:154 +#: ../../source/models/model_abilities/image.rst:210 msgid "Stable Diffusion ControlNet" msgstr "" -#: ../../source/models/model_abilities/image.rst:157 +#: ../../source/models/model_abilities/image.rst:213 msgid "Learn from a Stable Diffusion ControlNet example" msgstr "学习一个 Stable Diffusion 控制网络的示例" -#: ../../source/models/model_abilities/image.rst:160 +#: ../../source/models/model_abilities/image.rst:216 msgid "OCR" msgstr "" -#: ../../source/models/model_abilities/image.rst:162 +#: ../../source/models/model_abilities/image.rst:218 msgid "The OCR API accepts image bytes and returns the OCR text." msgstr "OCR API 接受图像字节并返回 OCR 文本。" -#: ../../source/models/model_abilities/image.rst:164 +#: ../../source/models/model_abilities/image.rst:220 msgid "We can try OCR API out either via cURL, or Xinference's python client:" msgstr "可以通过 cURL 或 Xinference 的 Python 客户端来尝试 OCR API。" +#~ msgid "" +#~ "If you are trying to run large " +#~ "image models liek sd3-medium or FLUX.1" +#~ " series on GPU card that has " +#~ "less memory than 24GB, you may " +#~ "encounter OOM when launching or " +#~ "inference. Try below solutions." +#~ msgstr "" +#~ "如果你试图在显存小于24GB的GPU上运行像" +#~ "sd3-medium或FLUX.1系列这样的大型图像模型" +#~ ",你在启动或推理过程中可能会遇到显存" +#~ "溢出(OOM)的问题。尝试以下解决方案。" + +#~ msgid "For FLUX.1 series, try to apply quantization." +#~ msgstr "对于 FLUX.1 系列,尝试应用量化。" + +#~ msgid "For sd3-medium, apply quantization to ``text_encoder_3``." +#~ msgstr "对于 sd3-medium 模型,对 ``text_encoder_3`` 应用量化。" + +#~ msgid "Or removing memory-intensive T5-XXL text encoder for sd3-medium." +#~ msgstr "或者,移除 sd3-medium 模型中内存密集型的 T5-XXL 文本编码器。" + diff --git a/doc/source/models/builtin/audio/cosyvoice2-0.5b.rst b/doc/source/models/builtin/audio/cosyvoice2-0.5b.rst new file mode 100644 index 0000000000..781da78852 --- /dev/null +++ b/doc/source/models/builtin/audio/cosyvoice2-0.5b.rst @@ -0,0 +1,19 @@ +.. _models_builtin_cosyvoice2-0.5b: + +=============== +CosyVoice2-0.5B +=============== + +- **Model Name:** CosyVoice2-0.5B +- **Model Family:** CosyVoice +- **Abilities:** text-to-audio +- **Multilingual:** True + +Specifications +^^^^^^^^^^^^^^ + +- **Model ID:** mrfakename/CosyVoice2-0.5B + +Execute the following command to launch the model:: + + xinference launch --model-name CosyVoice2-0.5B --model-type audio \ No newline at end of file diff --git a/doc/source/models/builtin/audio/f5-tts-mlx.rst b/doc/source/models/builtin/audio/f5-tts-mlx.rst new file mode 100644 index 0000000000..7ef102eb19 --- /dev/null +++ b/doc/source/models/builtin/audio/f5-tts-mlx.rst @@ -0,0 +1,19 @@ +.. 
_models_builtin_f5-tts-mlx:
+
+==========
+F5-TTS-MLX
+==========
+
+- **Model Name:** F5-TTS-MLX
+- **Model Family:** F5-TTS-MLX
+- **Abilities:** text-to-audio
+- **Multilingual:** True
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** lucasnewman/f5-tts-mlx
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name F5-TTS-MLX --model-type audio
\ No newline at end of file
diff --git a/doc/source/models/builtin/audio/fishspeech-1.5.rst b/doc/source/models/builtin/audio/fishspeech-1.5.rst
new file mode 100644
index 0000000000..83b0b7d001
--- /dev/null
+++ b/doc/source/models/builtin/audio/fishspeech-1.5.rst
@@ -0,0 +1,19 @@
+.. _models_builtin_fishspeech-1.5:
+
+==============
+FishSpeech-1.5
+==============
+
+- **Model Name:** FishSpeech-1.5
+- **Model Family:** FishAudio
+- **Abilities:** text-to-audio
+- **Multilingual:** True
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** fishaudio/fish-speech-1.5
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name FishSpeech-1.5 --model-type audio
\ No newline at end of file
diff --git a/doc/source/models/builtin/audio/index.rst b/doc/source/models/builtin/audio/index.rst
index 0936a93c02..f01047e13c 100644
--- a/doc/source/models/builtin/audio/index.rst
+++ b/doc/source/models/builtin/audio/index.rst
@@ -25,9 +25,13 @@ The following is a list of built-in audio models in Xinference:
 
   cosyvoice-300m-sft
 
+  cosyvoice2-0.5b
+
   f5-tts
 
-  fishspeech-1.4
+  f5-tts-mlx
+
+  fishspeech-1.5
 
   sensevoicesmall
 
diff --git a/doc/source/models/builtin/image/flux.1-dev.rst b/doc/source/models/builtin/image/flux.1-dev.rst
index 3a16cfe0a7..5018412d86 100644
--- a/doc/source/models/builtin/image/flux.1-dev.rst
+++ b/doc/source/models/builtin/image/flux.1-dev.rst
@@ -13,7 +13,15 @@ Specifications
 ^^^^^^^^^^^^^^
 
 - **Model ID:** black-forest-labs/FLUX.1-dev
+- **GGUF Model ID:** city96/FLUX.1-dev-gguf
+- **GGUF Quantizations:** F16, Q2_K, Q3_K_S, Q4_0, Q4_1, Q4_K_S, Q5_0, Q5_1, Q5_K_S, Q6_K, Q8_0
+
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name FLUX.1-dev --model-type image
\ No newline at end of file
+   xinference launch --model-name FLUX.1-dev --model-type image
+
+
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name FLUX.1-dev --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
diff --git a/doc/source/models/builtin/image/flux.1-schnell.rst b/doc/source/models/builtin/image/flux.1-schnell.rst
index df82d2069f..47bc1e2411 100644
--- a/doc/source/models/builtin/image/flux.1-schnell.rst
+++ b/doc/source/models/builtin/image/flux.1-schnell.rst
@@ -13,7 +13,15 @@ Specifications
 ^^^^^^^^^^^^^^
 
 - **Model ID:** black-forest-labs/FLUX.1-schnell
+- **GGUF Model ID:** city96/FLUX.1-schnell-gguf
+- **GGUF Quantizations:** F16, Q2_K, Q3_K_S, Q4_0, Q4_1, Q4_K_S, Q5_0, Q5_1, Q5_K_S, Q6_K, Q8_0
+
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name FLUX.1-schnell --model-type image
\ No newline at end of file
+   xinference launch --model-name FLUX.1-schnell --model-type image
+
+
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name FLUX.1-schnell --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
diff --git a/doc/source/models/builtin/image/got-ocr2_0.rst b/doc/source/models/builtin/image/got-ocr2_0.rst
index 994b0deae4..b1e9b1d5d2 100644
--- a/doc/source/models/builtin/image/got-ocr2_0.rst
+++
b/doc/source/models/builtin/image/got-ocr2_0.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name GOT-OCR2_0 --model-type image
\ No newline at end of file
+   xinference launch --model-name GOT-OCR2_0 --model-type image
+
diff --git a/doc/source/models/builtin/image/index.rst b/doc/source/models/builtin/image/index.rst
index bf4efdab86..dacc0cbe38 100644
--- a/doc/source/models/builtin/image/index.rst
+++ b/doc/source/models/builtin/image/index.rst
@@ -23,6 +23,12 @@ The following is a list of built-in image models in Xinference:
 
   sd3-medium
 
+  sd3.5-large
+
+  sd3.5-large-turbo
+
+  sd3.5-medium
+
   sdxl-turbo
 
   stable-diffusion-2-inpainting
diff --git a/doc/source/models/builtin/image/kolors.rst b/doc/source/models/builtin/image/kolors.rst
index 19d11c4201..886de8ff02 100644
--- a/doc/source/models/builtin/image/kolors.rst
+++ b/doc/source/models/builtin/image/kolors.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name kolors --model-type image
\ No newline at end of file
+   xinference launch --model-name kolors --model-type image
+
diff --git a/doc/source/models/builtin/image/sd-turbo.rst b/doc/source/models/builtin/image/sd-turbo.rst
index e799b423a3..c78ebaef91 100644
--- a/doc/source/models/builtin/image/sd-turbo.rst
+++ b/doc/source/models/builtin/image/sd-turbo.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name sd-turbo --model-type image
\ No newline at end of file
+   xinference launch --model-name sd-turbo --model-type image
+
diff --git a/doc/source/models/builtin/image/sd3-medium.rst b/doc/source/models/builtin/image/sd3-medium.rst
index 953a3eca32..f0e728cc02 100644
--- a/doc/source/models/builtin/image/sd3-medium.rst
+++ b/doc/source/models/builtin/image/sd3-medium.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name sd3-medium --model-type image
\ No newline at end of file
+   xinference launch --model-name sd3-medium --model-type image
+
diff --git a/doc/source/models/builtin/image/sd3.5-large-turbo.rst b/doc/source/models/builtin/image/sd3.5-large-turbo.rst
new file mode 100644
index 0000000000..3c7e26c2b0
--- /dev/null
+++ b/doc/source/models/builtin/image/sd3.5-large-turbo.rst
@@ -0,0 +1,27 @@
+.. _models_builtin_sd3.5-large-turbo:
+
+=================
+sd3.5-large-turbo
+=================
+
+- **Model Name:** sd3.5-large-turbo
+- **Model Family:** stable_diffusion
+- **Abilities:** text2image, image2image, inpainting
+- **Available ControlNet:** None
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** stabilityai/stable-diffusion-3.5-large-turbo
+- **GGUF Model ID:** city96/stable-diffusion-3.5-large-turbo-gguf
+- **GGUF Quantizations:** F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0
+
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name sd3.5-large-turbo --model-type image
+
+
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name sd3.5-large-turbo --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
diff --git a/doc/source/models/builtin/image/sd3.5-large.rst b/doc/source/models/builtin/image/sd3.5-large.rst
new file mode 100644
index 0000000000..d998fc4908
--- /dev/null
+++ b/doc/source/models/builtin/image/sd3.5-large.rst
@@ -0,0 +1,27 @@
+..
_models_builtin_sd3.5-large:
+
+===========
+sd3.5-large
+===========
+
+- **Model Name:** sd3.5-large
+- **Model Family:** stable_diffusion
+- **Abilities:** text2image, image2image, inpainting
+- **Available ControlNet:** None
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** stabilityai/stable-diffusion-3.5-large
+- **GGUF Model ID:** city96/stable-diffusion-3.5-large-gguf
+- **GGUF Quantizations:** F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0
+
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name sd3.5-large --model-type image
+
+
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name sd3.5-large --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
diff --git a/doc/source/models/builtin/image/sd3.5-medium.rst b/doc/source/models/builtin/image/sd3.5-medium.rst
new file mode 100644
index 0000000000..42a65f7e47
--- /dev/null
+++ b/doc/source/models/builtin/image/sd3.5-medium.rst
@@ -0,0 +1,27 @@
+.. _models_builtin_sd3.5-medium:
+
+============
+sd3.5-medium
+============
+
+- **Model Name:** sd3.5-medium
+- **Model Family:** stable_diffusion
+- **Abilities:** text2image, image2image, inpainting
+- **Available ControlNet:** None
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** stabilityai/stable-diffusion-3.5-medium
+- **GGUF Model ID:** city96/stable-diffusion-3.5-medium-gguf
+- **GGUF Quantizations:** F16, Q3_K_M, Q3_K_S, Q4_0, Q4_1, Q4_K_M, Q4_K_S, Q5_0, Q5_1, Q5_K_M, Q5_K_S, Q6_K, Q8_0
+
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name sd3.5-medium --model-type image
+
+
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name sd3.5-medium --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
diff --git a/doc/source/models/builtin/image/sdxl-turbo.rst b/doc/source/models/builtin/image/sdxl-turbo.rst
index 878b10079f..9da2b63bf2 100644
--- a/doc/source/models/builtin/image/sdxl-turbo.rst
+++ b/doc/source/models/builtin/image/sdxl-turbo.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name sdxl-turbo --model-type image
\ No newline at end of file
+   xinference launch --model-name sdxl-turbo --model-type image
+
diff --git a/doc/source/models/builtin/image/stable-diffusion-2-inpainting.rst b/doc/source/models/builtin/image/stable-diffusion-2-inpainting.rst
index 6009cd37a3..01a84a63db 100644
--- a/doc/source/models/builtin/image/stable-diffusion-2-inpainting.rst
+++ b/doc/source/models/builtin/image/stable-diffusion-2-inpainting.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name stable-diffusion-2-inpainting --model-type image
\ No newline at end of file
+   xinference launch --model-name stable-diffusion-2-inpainting --model-type image
+
diff --git a/doc/source/models/builtin/image/stable-diffusion-inpainting.rst b/doc/source/models/builtin/image/stable-diffusion-inpainting.rst
index 76f1274048..3b4832471c 100644
--- a/doc/source/models/builtin/image/stable-diffusion-inpainting.rst
+++ b/doc/source/models/builtin/image/stable-diffusion-inpainting.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name stable-diffusion-inpainting --model-type image
\ No newline at end of file
+   xinference launch --model-name stable-diffusion-inpainting --model-type image
+
diff --git
a/doc/source/models/builtin/image/stable-diffusion-v1.5.rst b/doc/source/models/builtin/image/stable-diffusion-v1.5.rst
index 5a0c73adcd..764bbb0a4c 100644
--- a/doc/source/models/builtin/image/stable-diffusion-v1.5.rst
+++ b/doc/source/models/builtin/image/stable-diffusion-v1.5.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name stable-diffusion-v1.5 --model-type image
\ No newline at end of file
+   xinference launch --model-name stable-diffusion-v1.5 --model-type image
+
diff --git a/doc/source/models/builtin/image/stable-diffusion-xl-base-1.0.rst b/doc/source/models/builtin/image/stable-diffusion-xl-base-1.0.rst
index a4f7518dbf..cfb515be55 100644
--- a/doc/source/models/builtin/image/stable-diffusion-xl-base-1.0.rst
+++ b/doc/source/models/builtin/image/stable-diffusion-xl-base-1.0.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name stable-diffusion-xl-base-1.0 --model-type image
\ No newline at end of file
+   xinference launch --model-name stable-diffusion-xl-base-1.0 --model-type image
+
diff --git a/doc/source/models/builtin/image/stable-diffusion-xl-inpainting.rst b/doc/source/models/builtin/image/stable-diffusion-xl-inpainting.rst
index 61a72cc044..f096627d51 100644
--- a/doc/source/models/builtin/image/stable-diffusion-xl-inpainting.rst
+++ b/doc/source/models/builtin/image/stable-diffusion-xl-inpainting.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name stable-diffusion-xl-inpainting --model-type image
\ No newline at end of file
+   xinference launch --model-name stable-diffusion-xl-inpainting --model-type image
+
diff --git a/doc/source/models/model_abilities/image.rst b/doc/source/models/model_abilities/image.rst
index 79834e0dca..e49971a85b 100644
--- a/doc/source/models/model_abilities/image.rst
+++ b/doc/source/models/model_abilities/image.rst
@@ -1,4 +1,4 @@
-.. _image:
+.. _image:
 
 ======
 Images
 ======
@@ -41,6 +41,9 @@ The Text-to-image API is supported with the following models in Xinference:
 * stable-diffusion-v1.5
 * stable-diffusion-xl-base-1.0
 * sd3-medium
+* sd3.5-medium
+* sd3.5-large
+* sd3.5-large-turbo
 * FLUX.1-schnell
 * FLUX.1-dev
 
@@ -105,8 +108,15 @@ We can try Text-to-image API out either via cURL, OpenAI Client, or Xinference's
 }
 
 
-Tips for Large Image Models including SD3-Medium, FLUX.1
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Quantize Large Image Models e.g. SD3-Medium, FLUX.1
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. note::
+
+   From v0.16.1, Xinference enables quantization by default for
+   large image models like the Flux.1 and SD3.5 series.
+   So if your Xinference version is newer than v0.16.1,
+   you barely need to do anything to run those large image models on GPUs with limited memory.
 
 Useful extra parameters can be passed to launch including:
 
@@ -120,29 +130,77 @@ Useful extra parameters can be passed to launch including:
 * ``--text_encoder_3 None``, for sd3-medium, removing the memory-intensive 4.7B parameter T5-XXL
   text encoder during inference can significantly decrease the memory requirements
   with only a slight loss in performance.
+* ``--transformer_nf4 True``: use NF4 quantization for the transformer.
+* ``--quantize``: only works for MLX on Mac. Flux.1-dev and Flux.1-schnell will switch to
+  the MLX engine on Mac, and ``quantize`` can be used to quantize the model.
+
+For WebUI, just add additional parameters, e.g.
add key ``cpu_offload`` and value ``True``
+to enable CPU offloading.
+
+Below are the default options used from v0.16.1.
+
++-------------------+-----------------------+----------------------+------------------+
+| Model             | quantize_text_encoder | quantize             | transformer_nf4  |
++===================+=======================+======================+==================+
+| FLUX.1-dev        | text_encoder_2        | True                 | False            |
++-------------------+-----------------------+----------------------+------------------+
+| FLUX.1-schnell    | text_encoder_2        | True                 | False            |
++-------------------+-----------------------+----------------------+------------------+
+| sd3-medium        | text_encoder_3        | N/A                  | False            |
++-------------------+-----------------------+----------------------+------------------+
+| sd3.5-medium      | text_encoder_3        | N/A                  | False            |
++-------------------+-----------------------+----------------------+------------------+
+| sd3.5-large       | text_encoder_3        | N/A                  | True             |
++-------------------+-----------------------+----------------------+------------------+
+| sd3.5-large-turbo | text_encoder_3        | N/A                  | True             |
++-------------------+-----------------------+----------------------+------------------+
+
+.. note::
+
+    If you want to disable some quantization, just set the corresponding option to False,
+    e.g. for the Web UI, set key ``quantize_text_encoder`` and value ``False``,
+    and for the command line, specify ``--quantize_text_encoder False`` to disable quantization
+    for the text encoder.
+
+GGUF file format
+~~~~~~~~~~~~~~~~
+
+The GGUF file format provides various quantization options for the transformer.
+To use a GGUF file, you can specify the additional option ``gguf_quantization`` in the web UI,
+or ``--gguf_quantization`` on the command line, for those image models supported
+internally by Xinference. Below is the list of supported models.
+
++-------------------+------------------------------------------------------------------------------------------+
+| Model             | supported gguf quantization                                                              |
++===================+==========================================================================================+
+| FLUX.1-dev        | F16, Q2_K, Q3_K_S, Q4_0, Q4_1, Q4_K_S, Q5_0, Q5_1, Q5_K_S, Q6_K, Q8_0                    |
++-------------------+------------------------------------------------------------------------------------------+
+| FLUX.1-schnell    | F16, Q2_K, Q3_K_S, Q4_0, Q4_1, Q4_K_S, Q5_0, Q5_1, Q5_K_S, Q6_K, Q8_0                    |
++-------------------+------------------------------------------------------------------------------------------+
+| sd3.5-medium      | F16, Q3_K_M, Q3_K_S, Q4_0, Q4_1, Q4_K_M, Q4_K_S, Q5_0, Q5_1, Q5_K_M, Q5_K_S, Q6_K, Q8_0 |
++-------------------+------------------------------------------------------------------------------------------+
+| sd3.5-large       | F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0                                                        |
++-------------------+------------------------------------------------------------------------------------------+
+| sd3.5-large-turbo | F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0                                                        |
++-------------------+------------------------------------------------------------------------------------------+
+
+.. note::
+
+    We strongly recommend enabling the additional option ``cpu_offload`` with value ``True`` in the WebUI,
+    or specifying ``--cpu_offload True`` on the command line.
+
+Example:
+
+.. code-block:: bash
+
+    xinference launch --model-name FLUX.1-dev --model-type image --gguf_quantization Q2_K --cpu_offload True
+
+With ``Q2_K`` quantization, you only need around 5 GiB of GPU memory to run Flux.1-dev.
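+
+To sanity-check a launched model, Xinference's Python client can be used. The following
+is a minimal sketch; it assumes a local endpoint at ``http://127.0.0.1:9997`` and that
+``FLUX.1-dev`` is the model UID printed by ``xinference launch``:
+
+.. code-block:: python
+
+    from xinference.client import Client
+
+    client = Client("http://127.0.0.1:9997")
+    model = client.get_model("FLUX.1-dev")  # UID returned by `xinference launch`
+    # Mimics OpenAI's create-images API; the result contains a "data" list.
+    result = model.text_to_image("an astronaut riding a horse on the moon")
+    print(result["data"][0]["url"])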
+
+For models whose GGUF options are not supported internally, or if you want to download GGUF files
+on your own, you can specify the additional option ``gguf_model_path`` in the web UI or specify
+``--gguf_model_path /path/to/model_quant.gguf`` on the command line.
 
-If you are trying to run large image models liek sd3-medium or FLUX.1 series on GPU card
-that has less memory than 24GB, you may encounter OOM when launching or inference.
-Try below solutions.
-
-For FLUX.1 series, try to apply quantization.
-
-.. code:: bash
-
-    xinference launch --model-name FLUX.1-dev --model-type image --quantize_text_encoder text_encoder_2
-
-For sd3-medium, apply quantization to ``text_encoder_3``.
-
-.. code:: bash
-
-    xinference launch --model-name sd3-medium --model-type image --quantize_text_encoder text_encoder_3
-
-
-Or removing memory-intensive T5-XXL text encoder for sd3-medium.
-
-.. code:: bash
-
-    xinference launch --model-name sd3-medium --model-type image --text_encoder_3 None
 
 Image-to-image
 --------------------
diff --git a/doc/templates/image.rst.jinja b/doc/templates/image.rst.jinja
index 06379d7d56..8e1e57c145 100644
--- a/doc/templates/image.rst.jinja
+++ b/doc/templates/image.rst.jinja
@@ -13,7 +13,17 @@ Specifications
 ^^^^^^^^^^^^^^
 
 - **Model ID:** {{ model_id }}
+{%- if gguf_quantizations %}
+- **GGUF Model ID:** {{ gguf_model_id }}
+- **GGUF Quantizations:** {{ gguf_quantizations }}
+{% endif %}
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name {{ model_name }} --model-type image
\ No newline at end of file
+   xinference launch --model-name {{ model_name }} --model-type image
+
+{% if gguf_quantizations %}
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name {{ model_name }} --model-type image --gguf_quantization ${{ '{' }}gguf_quantization{{ '}' }} --cpu_offload True
+{% endif %}
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index f21d2431b4..7cafa63ce6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -90,7 +90,6 @@ all =
     tiktoken>=0.6.0
     sentence-transformers>=3.1.0
     vllm>=0.2.6 ; sys_platform=='linux'
-    diffusers>=0.30.0
     imageio-ffmpeg # For video
     controlnet_aux
     orjson
@@ -119,7 +118,8 @@ all =
     hydra-core>=1.3.2 # For CosyVoice, matcha
     inflect # For CosyVoice, matcha
     conformer # For CosyVoice, matcha
-    diffusers>=0.30.0 # For CosyVoice, matcha
+    diffusers>=0.32.0 # For CosyVoice, matcha
+    gguf # For image
     gdown # For CosyVoice, matcha
     pyarrow # For CosyVoice, matcha
     HyperPyYAML # For CosyVoice
@@ -193,9 +193,10 @@ embedding =
 rerank =
     FlagEmbedding
 image =
-    diffusers>=0.30.0 # fix conflict with matcha-tts
+    diffusers>=0.32.0 # fix conflict with matcha-tts
    controlnet_aux
     deepcache
+    gguf
     verovio>=4.3.1 # For got_ocr2
     transformers>=4.37.2 # For got_ocr2
     tiktoken>=0.6.0 # For got_ocr2
@@ -203,7 +204,7 @@ image =
     torch # For got_ocr2
     torchvision # For got_ocr2
 video =
-    diffusers>=0.30.0
+    diffusers>=0.32.0
     imageio-ffmpeg
 audio =
     funasr<1.1.17
@@ -220,7 +221,7 @@ audio =
     hydra-core>=1.3.2 # For CosyVoice, matcha
     inflect # For CosyVoice, matcha
     conformer # For CosyVoice, matcha
-    diffusers>=0.30.0 # For CosyVoice, matcha
+    diffusers>=0.32.0 # For CosyVoice, matcha
     gdown # For CosyVoice, matcha
     pyarrow # For CosyVoice, matcha
     HyperPyYAML # For CosyVoice
diff --git a/xinference/core/utils.py b/xinference/core/utils.py
index 278c570b20..6f9470107c 100644
--- a/xinference/core/utils.py
+++ b/xinference/core/utils.py
@@ -62,12 +62,16 @@ def decorator(func):
 
         @wraps(func)
         async def wrapped(*args, **kwargs):
-            try:
-                bound_args =
sig.bind_partial(*args, **kwargs) - arguments = bound_args.arguments - except TypeError: - arguments = {} - request_id_str = arguments.get("request_id", "") + request_id_str = kwargs.get("request_id") + if not request_id_str: + # sometimes `request_id` not in kwargs + # we try to bind the arguments + try: + bound_args = sig.bind_partial(*args, **kwargs) + arguments = bound_args.arguments + except TypeError: + arguments = {} + request_id_str = arguments.get("request_id", "") if not request_id_str: request_id_str = uuid.uuid1() if func_name == "text_to_image": diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt index 4c080714f9..b07688898b 100644 --- a/xinference/deploy/docker/requirements.txt +++ b/xinference/deploy/docker/requirements.txt @@ -34,7 +34,6 @@ protobuf einops tiktoken>=0.6.0 sentence-transformers>=3.1.0 -diffusers>=0.30.0 controlnet_aux orjson auto-gptq @@ -57,7 +56,7 @@ lightning>=2.0.0 # For CosyVoice, matcha hydra-core>=1.3.2 # For CosyVoice, matcha inflect # For CosyVoice, matcha conformer # For CosyVoice, matcha -diffusers>=0.30.0 # For CosyVoice, matcha +diffusers>=0.32.0 # For CosyVoice, matcha gdown # For CosyVoice, matcha pyarrow # For CosyVoice, matcha HyperPyYAML # For CosyVoice @@ -88,6 +87,7 @@ datamodel_code_generator # for minicpm-4B jsonschema # for minicpm-4B deepcache # for sd verovio>=4.3.1 # For got_ocr2 +gguf # sglang decord diff --git a/xinference/deploy/docker/requirements_cpu.txt b/xinference/deploy/docker/requirements_cpu.txt index 5465e7e1da..e89475bf70 100644 --- a/xinference/deploy/docker/requirements_cpu.txt +++ b/xinference/deploy/docker/requirements_cpu.txt @@ -31,7 +31,6 @@ einops tiktoken sentence-transformers>=3.1.0 FlagEmbedding -diffusers>=0.30.0 controlnet_aux orjson auto-gptq @@ -54,7 +53,7 @@ lightning>=2.0.0 # For CosyVoice, matcha hydra-core>=1.3.2 # For CosyVoice, matcha inflect # For CosyVoice, matcha conformer # For CosyVoice, matcha -diffusers>=0.30.0 # For CosyVoice, matcha +diffusers>=0.32.0 # For CosyVoice, matcha gdown # For CosyVoice, matcha pyarrow # For CosyVoice, matcha HyperPyYAML # For CosyVoice @@ -82,3 +81,4 @@ qwen-vl-utils # For qwen2-vl datamodel_code_generator # for minicpm-4B jsonschema # for minicpm-4B verovio>=4.3.1 # For got_ocr2 +gguf diff --git a/xinference/model/image/core.py b/xinference/model/image/core.py index 432a70c1a4..cfaa11070a 100644 --- a/xinference/model/image/core.py +++ b/xinference/model/image/core.py @@ -22,7 +22,12 @@ from ...constants import XINFERENCE_CACHE_DIR from ...types import PeftModelConfig from ..core import CacheableModelSpec, ModelDescription -from ..utils import valid_model_revision +from ..utils import ( + IS_NEW_HUGGINGFACE_HUB, + retry_download, + symlink_local_file, + valid_model_revision, +) from .ocr.got_ocr2 import GotOCR2Model from .stable_diffusion.core import DiffusionModel from .stable_diffusion.mlx import MLXDiffusionModel @@ -51,6 +56,9 @@ class ImageModelFamilyV1(CacheableModelSpec): controlnet: Optional[List["ImageModelFamilyV1"]] default_model_config: Optional[dict] = {} default_generate_config: Optional[dict] = {} + gguf_model_id: Optional[str] + gguf_quantizations: Optional[List[str]] + gguf_model_file_name_template: Optional[str] class ImageModelDescription(ModelDescription): @@ -187,6 +195,61 @@ def get_cache_status( return valid_model_revision(meta_path, model_spec.model_revision) +def cache_gguf(spec: ImageModelFamilyV1, quantization: Optional[str] = None): + if not quantization: + return + + cache_dir = 
os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, spec.model_name)) + if not os.path.exists(cache_dir): + os.makedirs(cache_dir, exist_ok=True) + + if not spec.gguf_model_file_name_template: + raise NotImplementedError( + f"{spec.model_name} does not support GGUF quantization" + ) + if quantization not in (spec.gguf_quantizations or []): + raise ValueError( + f"Cannot support quantization {quantization}, " + f"available quantizations: {spec.gguf_quantizations}" + ) + + filename = spec.gguf_model_file_name_template.format(quantization=quantization) # type: ignore + full_path = os.path.join(cache_dir, filename) + + if spec.model_hub == "huggingface": + import huggingface_hub + + use_symlinks = {} + if not IS_NEW_HUGGINGFACE_HUB: + use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir} + download_file_path = retry_download( + huggingface_hub.hf_hub_download, + spec.model_name, + None, + spec.gguf_model_id, + filename=filename, + **use_symlinks, + ) + if IS_NEW_HUGGINGFACE_HUB: + symlink_local_file(download_file_path, cache_dir, filename) + elif spec.model_hub == "modelscope": + from modelscope.hub.file_download import model_file_download + + download_file_path = retry_download( + model_file_download, + spec.model_name, + None, + spec.gguf_model_id, + filename, + revision=spec.model_revision, + ) + symlink_local_file(download_file_path, cache_dir, filename) + else: + raise NotImplementedError + + return full_path + + def create_ocr_model_instance( subpool_addr: str, devices: List[str], @@ -219,6 +282,8 @@ def create_image_model_instance( Literal["huggingface", "modelscope", "openmind_hub", "csghub"] ] = None, model_path: Optional[str] = None, + gguf_quantization: Optional[str] = None, + gguf_model_path: Optional[str] = None, **kwargs, ) -> Tuple[ Union[DiffusionModel, MLXDiffusionModel, GotOCR2Model], ImageModelDescription @@ -272,6 +337,8 @@ def create_image_model_instance( ] if not model_path: model_path = cache(model_spec) + if not gguf_model_path and gguf_quantization: + gguf_model_path = cache_gguf(model_spec, gguf_quantization) if peft_model_config is not None: lora_model = peft_model_config.peft_model lora_load_kwargs = peft_model_config.image_lora_load_kwargs @@ -298,6 +365,7 @@ def create_image_model_instance( lora_load_kwargs=lora_load_kwargs, lora_fuse_kwargs=lora_fuse_kwargs, model_spec=model_spec, + gguf_model_path=gguf_model_path, **kwargs, ) model_description = ImageModelDescription( diff --git a/xinference/model/image/model_spec.json b/xinference/model/image/model_spec.json index 24933cb99e..9d0b071a34 100644 --- a/xinference/model/image/model_spec.json +++ b/xinference/model/image/model_spec.json @@ -11,8 +11,24 @@ ], "default_model_config": { "quantize": true, - "quantize_text_encoder": "text_encoder_2" - } + "quantize_text_encoder": "text_encoder_2", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "city96/FLUX.1-schnell-gguf", + "gguf_quantizations": [ + "F16", + "Q2_K", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + "gguf_model_file_name_template": "flux1-schnell-{quantization}.gguf" }, { "model_name": "FLUX.1-dev", @@ -26,8 +42,24 @@ ], "default_model_config": { "quantize": true, - "quantize_text_encoder": "text_encoder_2" - } + "quantize_text_encoder": "text_encoder_2", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "city96/FLUX.1-dev-gguf", + "gguf_quantizations": [ + "F16", + "Q2_K", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + 
"gguf_model_file_name_template": "flux1-dev-{quantization}.gguf" }, { "model_name": "sd3-medium", @@ -44,6 +76,97 @@ "quantize_text_encoder": "text_encoder_3" } }, + { + "model_name": "sd3.5-medium", + "model_family": "stable_diffusion", + "model_id": "stabilityai/stable-diffusion-3.5-medium", + "model_revision": "94b13ccbe959c51e8159d91f562c58f29fac971a", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "city96/stable-diffusion-3.5-medium-gguf", + "gguf_quantizations": [ + "F16", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_M", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_medium-{quantization}.gguf" + }, + { + "model_name": "sd3.5-large", + "model_family": "stable_diffusion", + "model_id": "stabilityai/stable-diffusion-3.5-large", + "model_revision": "ceddf0a7fdf2064ea28e2213e3b84e4afa170a0f", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16", + "transformer_nf4": true + }, + "gguf_model_id": "city96/stable-diffusion-3.5-large-gguf", + "gguf_quantizations": [ + "F16", + "Q4_0", + "Q4_1", + "Q5_0", + "Q5_1", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_large-{quantization}.gguf" + }, + { + "model_name": "sd3.5-large-turbo", + "model_family": "stable_diffusion", + "model_id": "stabilityai/stable-diffusion-3.5-large-turbo", + "model_revision": "ec07796fc06b096cc56de9762974a28f4c632eda", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16", + "transformer_nf4": true + }, + "default_generate_config": { + "guidance_scale": 1.0, + "num_inference_steps": 4 + }, + "gguf_model_id": "city96/stable-diffusion-3.5-large-turbo-gguf", + "gguf_quantizations": [ + "F16", + "Q4_0", + "Q4_1", + "Q5_0", + "Q5_1", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf" + }, { "model_name": "sd-turbo", "model_family": "stable_diffusion", diff --git a/xinference/model/image/model_spec_modelscope.json b/xinference/model/image/model_spec_modelscope.json index ad8af7a26f..391749161c 100644 --- a/xinference/model/image/model_spec_modelscope.json +++ b/xinference/model/image/model_spec_modelscope.json @@ -12,8 +12,24 @@ ], "default_model_config": { "quantize": true, - "quantize_text_encoder": "text_encoder_2" - } + "quantize_text_encoder": "text_encoder_2", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "Xorbits/FLUX.1-schnell-gguf", + "gguf_quantizations": [ + "F16", + "Q2_K", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + "gguf_model_file_name_template": "flux1-schnell-{quantization}.gguf" }, { "model_name": "FLUX.1-dev", @@ -28,8 +44,24 @@ ], "default_model_config": { "quantize": true, - "quantize_text_encoder": "text_encoder_2" - } + "quantize_text_encoder": "text_encoder_2", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "AI-ModelScope/FLUX.1-dev-gguf", + "gguf_quantizations": [ + "F16", + "Q2_K", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + "gguf_model_file_name_template": "flux1-dev-{quantization}.gguf" }, { 
"model_name": "sd3-medium", @@ -47,6 +79,100 @@ "quantize_text_encoder": "text_encoder_3" } }, + { + "model_name": "sd3.5-medium", + "model_family": "stable_diffusion", + "model_hub": "modelscope", + "model_id": "AI-ModelScope/stable-diffusion-3.5-medium", + "model_revision": "master", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "Xorbits/stable-diffusion-3.5-medium-gguf", + "gguf_quantizations": [ + "F16", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_M", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_medium-{quantization}.gguf" + }, + { + "model_name": "sd3.5-large", + "model_family": "stable_diffusion", + "model_hub": "modelscope", + "model_id": "AI-ModelScope/stable-diffusion-3.5-large", + "model_revision": "master", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16", + "transformer_nf4": true + }, + "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-gguf", + "gguf_quantizations": [ + "F16", + "Q4_0", + "Q4_1", + "Q5_0", + "Q5_1", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_large-{quantization}.gguf" + }, + { + "model_name": "sd3.5-large-turbo", + "model_family": "stable_diffusion", + "model_hub": "modelscope", + "model_id": "AI-ModelScope/stable-diffusion-3.5-large-turbo", + "model_revision": "master", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16", + "transformer_nf4": true + }, + "default_generate_config": { + "guidance_scale": 1.0, + "num_inference_steps": 4 + }, + "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-turbo-gguf", + "gguf_quantizations": [ + "F16", + "Q4_0", + "Q4_1", + "Q5_0", + "Q5_1", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf" + }, { "model_name": "sd-turbo", "model_family": "stable_diffusion", diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py index e0f7e5c886..3330fd9395 100644 --- a/xinference/model/image/stable_diffusion/core.py +++ b/xinference/model/image/stable_diffusion/core.py @@ -14,8 +14,10 @@ import contextlib import gc +import importlib import inspect import itertools +import json import logging import os import re @@ -86,6 +88,7 @@ def __init__( lora_load_kwargs: Optional[Dict] = None, lora_fuse_kwargs: Optional[Dict] = None, model_spec: Optional["ImageModelFamilyV1"] = None, + gguf_model_path: Optional[str] = None, **kwargs, ): self._model_uid = model_uid @@ -109,6 +112,8 @@ def __init__( self._model_spec = model_spec self._abilities = model_spec.model_ability or [] # type: ignore self._kwargs = kwargs + # gguf + self._gguf_model_path = gguf_model_path @property def model_ability(self): @@ -184,7 +189,17 @@ def _apply_lora(self): self._model.fuse_lora(**self._lora_fuse_kwargs) logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.") + def _get_layer_cls(self, layer: str): + with open(os.path.join(self._model_path, "model_index.json")) as f: # type: ignore + model_index = json.load(f) + layer_info = model_index[layer] + module_name, class_name = layer_info + module = 
importlib.import_module(module_name) + return getattr(module, class_name) + def load(self): + from transformers import BitsAndBytesConfig, T5EncoderModel + if "text2image" in self._abilities or "image2image" in self._abilities: from diffusers import AutoPipelineForText2Image as AutoPipelineModel elif "inpainting" in self._abilities: @@ -200,7 +215,9 @@ def load(self): glob(os.path.join(self._model_path, "*/*.safetensors")) ) if isinstance(torch_dtype, str): - self._kwargs["torch_dtype"] = getattr(torch, torch_dtype) + self._torch_dtype = torch_dtype = self._kwargs["torch_dtype"] = getattr( + torch, torch_dtype + ) controlnet = self._kwargs.get("controlnet") if controlnet is not None: @@ -212,18 +229,7 @@ def load(self): ] quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None) - if quantize_text_encoder: - try: - from transformers import BitsAndBytesConfig, T5EncoderModel - except ImportError: - error_message = "Failed to import module 'transformers'" - installation_guide = [ - "Please make sure 'transformers' is installed. ", - "You can install it by `pip install transformers`\n", - ] - - raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") - + if quantize_text_encoder and not self._gguf_model_path: try: import bitsandbytes # noqa: F401 except ImportError: @@ -249,6 +255,32 @@ def load(self): self._kwargs[text_encoder_name] = text_encoder self._kwargs["device_map"] = "balanced" + if self._gguf_model_path: + from diffusers import GGUFQuantizationConfig + + # GGUF transformer + self._kwargs["transformer"] = self._get_layer_cls( + "transformer" + ).from_single_file( + self._gguf_model_path, + quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype), + torch_dtype=torch_dtype, + config=os.path.join(self._model_path, "transformer"), + ) + elif self._kwargs.get("transformer_nf4"): + nf4_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch_dtype, + ) + model_nf4 = self._get_layer_cls("transformer").from_pretrained( + self._model_path, + subfolder="transformer", + quantization_config=nf4_config, + torch_dtype=torch_dtype, + ) + self._kwargs["transformer"] = model_nf4 + logger.debug( "Loading model from %s, kwargs: %s", self._model_path, self._kwargs )
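
For context, the GGUF transformer loading added in ``stable_diffusion/core.py`` above mirrors
the standalone ``diffusers`` API. A minimal sketch, assuming ``diffusers>=0.32.0`` (which
provides ``GGUFQuantizationConfig``) and a locally downloaded ``flux1-dev-Q4_0.gguf``; the file
name and prompt are illustrative::

    import torch
    from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

    # Load only the transformer from the GGUF file; the remaining
    # pipeline components are loaded in bfloat16 as usual.
    transformer = FluxTransformer2DModel.from_single_file(
        "flux1-dev-Q4_0.gguf",
        quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
        torch_dtype=torch.bfloat16,
    )
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev",
        transformer=transformer,
        torch_dtype=torch.bfloat16,
    )
    pipe.enable_model_cpu_offload()  # counterpart of `--cpu_offload True`
    image = pipe("a cup of coffee on a desk", num_inference_steps=28).images[0]
    image.save("out.png")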