From d3428697115cc4666b38b32925ba28bdc1a21957 Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 27 Dec 2024 18:14:37 +0800 Subject: [PATCH] FEAT: support SD3.5 series model (#2706) --- doc/source/gen_docs.py | 1 + .../models/model_abilities/image.po | 247 +++++++++++++++--- .../models/builtin/audio/cosyvoice2-0.5b.rst | 19 ++ .../models/builtin/audio/f5-tts-mlx.rst | 19 ++ .../models/builtin/audio/fishspeech-1.5.rst | 19 ++ doc/source/models/builtin/audio/index.rst | 6 +- .../models/builtin/image/flux.1-dev.rst | 10 +- .../models/builtin/image/flux.1-schnell.rst | 10 +- .../models/builtin/image/got-ocr2_0.rst | 3 +- doc/source/models/builtin/image/index.rst | 6 + doc/source/models/builtin/image/kolors.rst | 3 +- doc/source/models/builtin/image/sd-turbo.rst | 3 +- .../models/builtin/image/sd3-medium.rst | 3 +- .../builtin/image/sd3.5-large-turbo.rst | 27 ++ .../models/builtin/image/sd3.5-large.rst | 27 ++ .../models/builtin/image/sd3.5-medium.rst | 27 ++ .../models/builtin/image/sdxl-turbo.rst | 3 +- .../image/stable-diffusion-2-inpainting.rst | 3 +- .../image/stable-diffusion-inpainting.rst | 3 +- .../builtin/image/stable-diffusion-v1.5.rst | 3 +- .../image/stable-diffusion-xl-base-1.0.rst | 3 +- .../image/stable-diffusion-xl-inpainting.rst | 3 +- doc/source/models/model_abilities/image.rst | 108 ++++++-- doc/templates/image.rst.jinja | 12 +- setup.cfg | 11 +- xinference/core/utils.py | 16 +- xinference/deploy/docker/requirements.txt | 4 +- xinference/deploy/docker/requirements_cpu.txt | 4 +- xinference/model/image/core.py | 70 ++++- xinference/model/image/model_spec.json | 131 +++++++++- .../model/image/model_spec_modelscope.json | 134 +++++++++- .../model/image/stable_diffusion/core.py | 58 +++- 32 files changed, 884 insertions(+), 112 deletions(-) create mode 100644 doc/source/models/builtin/audio/cosyvoice2-0.5b.rst create mode 100644 doc/source/models/builtin/audio/f5-tts-mlx.rst create mode 100644 doc/source/models/builtin/audio/fishspeech-1.5.rst create mode 100644 doc/source/models/builtin/image/sd3.5-large-turbo.rst create mode 100644 doc/source/models/builtin/image/sd3.5-large.rst create mode 100644 doc/source/models/builtin/image/sd3.5-medium.rst diff --git a/doc/source/gen_docs.py b/doc/source/gen_docs.py index 68e381edf6..380d57bb8b 100644 --- a/doc/source/gen_docs.py +++ b/doc/source/gen_docs.py @@ -203,6 +203,7 @@ def get_unique_id(spec): available_controlnet = None model["available_controlnet"] = available_controlnet model["model_ability"] = ', '.join(model.get("model_ability")) + model["gguf_quantizations"] = ", ".join(model.get("gguf_quantizations", [])) rendered = env.get_template('image.rst.jinja').render(model) output_file_path = os.path.join(output_dir, f"{model['model_name'].lower()}.rst") with open(output_file_path, 'w') as output_file: diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po b/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po index e73ba213b0..9c61d12faa 100644 --- a/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po +++ b/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: Xinference \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2024-10-30 07:49+0000\n" +"POT-Creation-Date: 2024-12-26 18:49+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -17,7 +17,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 
8bit\n" -"Generated-By: Babel 2.16.0\n" +"Generated-By: Babel 2.14.0\n" #: ../../source/models/model_abilities/image.rst:5 msgid "Images" @@ -98,26 +98,48 @@ msgid "stable-diffusion-xl-base-1.0" msgstr "" #: ../../source/models/model_abilities/image.rst:43 +#: ../../source/models/model_abilities/image.rst:149 msgid "sd3-medium" msgstr "" #: ../../source/models/model_abilities/image.rst:44 -msgid "FLUX.1-schnell" +#: ../../source/models/model_abilities/image.rst:151 +#: ../../source/models/model_abilities/image.rst:180 +msgid "sd3.5-medium" msgstr "" #: ../../source/models/model_abilities/image.rst:45 +#: ../../source/models/model_abilities/image.rst:153 +#: ../../source/models/model_abilities/image.rst:182 +msgid "sd3.5-large" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:46 +#: ../../source/models/model_abilities/image.rst:155 +msgid "sd3.5-large-turbo" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:47 +#: ../../source/models/model_abilities/image.rst:147 +#: ../../source/models/model_abilities/image.rst:178 +msgid "FLUX.1-schnell" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:48 +#: ../../source/models/model_abilities/image.rst:145 +#: ../../source/models/model_abilities/image.rst:176 msgid "FLUX.1-dev" msgstr "" -#: ../../source/models/model_abilities/image.rst:49 +#: ../../source/models/model_abilities/image.rst:52 msgid "Quickstart" msgstr "快速入门" -#: ../../source/models/model_abilities/image.rst:52 +#: ../../source/models/model_abilities/image.rst:55 msgid "Text-to-image" msgstr "文生图" -#: ../../source/models/model_abilities/image.rst:54 +#: ../../source/models/model_abilities/image.rst:57 msgid "" "The Text-to-image API mimics OpenAI's `create images API " "`_. We can " @@ -127,15 +149,26 @@ msgstr "" "可以通过 cURL、OpenAI Client 或 Xinference 的方式尝试使用 Text-to-image " "API。" -#: ../../source/models/model_abilities/image.rst:109 -msgid "Tips for Large Image Models including SD3-Medium, FLUX.1" -msgstr "大型图像模型部署(sd3-medium、FLUX.1 系列)贴士" +#: ../../source/models/model_abilities/image.rst:112 +msgid "Quantize Large Image Models e.g. SD3-Medium, FLUX.1" +msgstr "量化大型图像模型(sd3-medium、FLUX.1 系列等)" -#: ../../source/models/model_abilities/image.rst:111 +#: ../../source/models/model_abilities/image.rst:116 +msgid "" +"From v0.16.1, Xinference by default enabled quantization for large image " +"models like Flux.1 and SD3.5 series. So if your Xinference version is " +"newer than v0.16.1, You barely need to do anything to run those large " +"image models on GPUs with small memory." 
+msgstr "" +"从 v0.16.1 开始,Xinference 默认对大图像模型如 Flux.1 和 SD3.5 系列开启" +"量化。如果你使用新于 v0.16.1 的 Xinference 版本,你不需要做什么事情来在小" +" GPU 显存的机器上来运行这些大型图像模型。" + +#: ../../source/models/model_abilities/image.rst:121 msgid "Useful extra parameters can be passed to launch including:" msgstr "有用的传递给加载模型的额外参数包括:" -#: ../../source/models/model_abilities/image.rst:113 +#: ../../source/models/model_abilities/image.rst:123 msgid "" "``--cpu_offload True``: specifying ``True`` will offload the components " "of the model to CPU during inference in order to save memory, while " @@ -147,7 +180,7 @@ msgstr "" "CPU 上以节省内存,这会导致推理延迟略有增加。模型卸载仅会在需要执行时将" "模型组件移动到 GPU 上,同时保持其余组件在 CPU 上" -#: ../../source/models/model_abilities/image.rst:117 +#: ../../source/models/model_abilities/image.rst:127 msgid "" "``--quantize_text_encoder ``: We leveraged the " "``bitsandbytes`` library to load and quantize the T5-XXL text encoder to " @@ -158,7 +191,7 @@ msgstr "" "`` 库加载并量化 T5-XXL 文本编码器至8位精度。这使得你能够在仅轻微影响性能" "的情况下继续使用全部文本编码器。" -#: ../../source/models/model_abilities/image.rst:120 +#: ../../source/models/model_abilities/image.rst:130 msgid "" "``--text_encoder_3 None``, for sd3-medium, removing the memory-intensive " "4.7B parameter T5-XXL text encoder during inference can significantly " @@ -167,53 +200,195 @@ msgstr "" "``--text_encoder_3 None``,对于 sd3-medium,移除在推理过程中内存密集型的" "47亿参数T5-XXL文本编码器可以显著降低内存需求,而仅造成性能上的轻微损失。" -#: ../../source/models/model_abilities/image.rst:124 +#: ../../source/models/model_abilities/image.rst:133 +msgid "``--transformer_nf4 True``: use nf4 for transformer quantization." +msgstr "``--transformer_nf4 True`` :使用 nf4 量化 transformer。" + +#: ../../source/models/model_abilities/image.rst:134 msgid "" -"If you are trying to run large image models liek sd3-medium or FLUX.1 " -"series on GPU card that has less memory than 24GB, you may encounter OOM " -"when launching or inference. Try below solutions." +"``--quantize``: Only work for MLX on Mac, Flux.1-dev and Flux.1-schnell " +"will switch to MLX engine on Mac, and ``quantize`` can be used to " +"quantize the model." msgstr "" -"如果你试图在显存小于24GB的GPU上运行像sd3-medium或FLUX.1系列这样的大型图像" -"模型,你在启动或推理过程中可能会遇到显存溢出(OOM)的问题。尝试以下" -"解决方案。" +"``--quantize`` :只对 Mac 上的 MLX 引擎生效,Flux.1-dev 和 Flux.1-schnell" +"会在 Mac 上使用 MLX 引擎计算,``quantize`` 可以用来量化模型。" -#: ../../source/models/model_abilities/image.rst:128 -msgid "For FLUX.1 series, try to apply quantization." -msgstr "对于 FLUX.1 系列,尝试应用量化。" +#: ../../source/models/model_abilities/image.rst:137 +msgid "" +"For WebUI, Just add additional parameters, e.g. add key ``cpu_offload`` " +"and value ``True`` to enable cpu offloading." +msgstr "" +"对于 WebUI,只需要添加额外参数,比如,添加 key ``cpu_offload`` 以及值 ``" +"True`` 来开启 CPU 卸载。" -#: ../../source/models/model_abilities/image.rst:134 -msgid "For sd3-medium, apply quantization to ``text_encoder_3``." -msgstr "对于 sd3-medium 模型,对 ``text_encoder_3`` 应用量化。" +#: ../../source/models/model_abilities/image.rst:140 +msgid "Below list default options that used from v0.16.1." +msgstr "如下列出了从 v0.16.1 开始默认使用的参数。" + +#: ../../source/models/model_abilities/image.rst:143 +#: ../../source/models/model_abilities/image.rst:174 +msgid "Model" +msgstr "模型" + +#: ../../source/models/model_abilities/image.rst:143 +msgid "quantize_text_encoder" +msgstr "" -#: ../../source/models/model_abilities/image.rst:141 -msgid "Or removing memory-intensive T5-XXL text encoder for sd3-medium." 
-msgstr "或者,移除 sd3-medium 模型中内存密集型的 T5-XXL 文本编码器。" +#: ../../source/models/model_abilities/image.rst:143 +msgid "quantize" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:143 +msgid "transformer_nf4" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:145 +#: ../../source/models/model_abilities/image.rst:147 +msgid "text_encoder_2" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:145 +#: ../../source/models/model_abilities/image.rst:147 +#: ../../source/models/model_abilities/image.rst:153 +#: ../../source/models/model_abilities/image.rst:155 +msgid "True" +msgstr "" -#: ../../source/models/model_abilities/image.rst:148 +#: ../../source/models/model_abilities/image.rst:145 +#: ../../source/models/model_abilities/image.rst:147 +#: ../../source/models/model_abilities/image.rst:149 +#: ../../source/models/model_abilities/image.rst:151 +msgid "False" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:149 +#: ../../source/models/model_abilities/image.rst:151 +#: ../../source/models/model_abilities/image.rst:153 +#: ../../source/models/model_abilities/image.rst:155 +msgid "text_encoder_3" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:149 +#: ../../source/models/model_abilities/image.rst:151 +#: ../../source/models/model_abilities/image.rst:153 +#: ../../source/models/model_abilities/image.rst:155 +msgid "N/A" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:160 +msgid "" +"If you want to disable some quantization, just set the corresponding " +"option to False. e.g. for Web UI, set key ``quantize_text_encoder`` and " +"value ``False`` and for command line, specify ``--quantize_text_encoder " +"False`` to disable quantization for text encoder." +msgstr "" +"如果你想关闭某些量化,只需要设置相应的选项为 False。比如,对于 Web UI," +"设置 key ``quantize_text_encoder`` 和值 ``False``,或对于命令行,指定 ``" +"--quantize_text_encoder False`` 来关闭 text encoder 的量化。" + +#: ../../source/models/model_abilities/image.rst:166 +msgid "GGUF file format" +msgstr "GGUF 文件格式" + +#: ../../source/models/model_abilities/image.rst:168 +msgid "" +"GGUF file format for transformer provides various quantization options. " +"To use gguf file, you can specify additional option ``gguf_quantization``" +" for web UI, or ``--gguf_quantization`` for command line for those image " +"models which support internally by Xinference. Below is the mode list." +msgstr "" +"GGUF 文件格式为 transformer 模块提供了丰富的量化选项。要使用 GGUF 文件," +"你可以在 Web 界面上指定额外选项 ``gguf_quantization`` ,或者在命令行指定 " +"``--gguf_quantization`` ,以为 Xinference 内建支持 GGUF 量化的模型开启。" +"如下是内置支持的模型。" + +#: ../../source/models/model_abilities/image.rst:174 +msgid "supported gguf quantization" +msgstr "支持 GGUF 量化格式" + +#: ../../source/models/model_abilities/image.rst:176 +#: ../../source/models/model_abilities/image.rst:178 +msgid "F16, Q2_K, Q3_K_S, Q4_0, Q4_1, Q4_K_S, Q5_0, Q5_1, Q5_K_S, Q6_K, Q8_0" +msgstr "" + +#: ../../source/models/model_abilities/image.rst:187 +msgid "" +"We stronly recommend to enable additional option ``cpu_offload`` with " +"value ``True`` for WebUI, or specify ``--cpu_offload True`` for command " +"line." +msgstr "" +"我们强烈推荐在 WebUI 上开启额外选项 ``cpu_offload`` 并指定为 ``True``,或" +"对命令行,指定 ``--cpu_offload True``。" + +#: ../../source/models/model_abilities/image.rst:190 +msgid "Example:" +msgstr "例如:" + +#: ../../source/models/model_abilities/image.rst:196 +msgid "" +"With ``Q2_K`` quantization, you only need around 5 GiB GPU memory to run " +"Flux.1-dev." 
+msgstr "" +"使用 ``Q2_K`` 量化,你只需要大约 5GB 的显存来运行 Flux.1-dev。" + +#: ../../source/models/model_abilities/image.rst:198 +msgid "" +"For those models gguf options are not supported internally, or you want " +"to download gguf files on you own, you can specify additional option " +"``gguf_model_path`` for web UI or spcecify ``--gguf_model_path " +"/path/to/model_quant.gguf`` for command line." +msgstr "" +"对于非内建支持 GGUF 量化的模型,或者你希望自己下载 GGUF 文件,你可以在 " +"Web UI 指定额外选项 ``gguf_model_path`` 或者用命令行指定 ``--gguf_model_" +"path /path/to/model_quant.gguf`` 。" + +#: ../../source/models/model_abilities/image.rst:204 msgid "Image-to-image" msgstr "图生图" -#: ../../source/models/model_abilities/image.rst:150 +#: ../../source/models/model_abilities/image.rst:206 msgid "You can find more examples of Images API in the tutorial notebook:" msgstr "你可以在教程笔记本中找到更多 Images API 的示例。" -#: ../../source/models/model_abilities/image.rst:154 +#: ../../source/models/model_abilities/image.rst:210 msgid "Stable Diffusion ControlNet" msgstr "" -#: ../../source/models/model_abilities/image.rst:157 +#: ../../source/models/model_abilities/image.rst:213 msgid "Learn from a Stable Diffusion ControlNet example" msgstr "学习一个 Stable Diffusion 控制网络的示例" -#: ../../source/models/model_abilities/image.rst:160 +#: ../../source/models/model_abilities/image.rst:216 msgid "OCR" msgstr "" -#: ../../source/models/model_abilities/image.rst:162 +#: ../../source/models/model_abilities/image.rst:218 msgid "The OCR API accepts image bytes and returns the OCR text." msgstr "OCR API 接受图像字节并返回 OCR 文本。" -#: ../../source/models/model_abilities/image.rst:164 +#: ../../source/models/model_abilities/image.rst:220 msgid "We can try OCR API out either via cURL, or Xinference's python client:" msgstr "可以通过 cURL 或 Xinference 的 Python 客户端来尝试 OCR API。" +#~ msgid "" +#~ "If you are trying to run large " +#~ "image models liek sd3-medium or FLUX.1" +#~ " series on GPU card that has " +#~ "less memory than 24GB, you may " +#~ "encounter OOM when launching or " +#~ "inference. Try below solutions." +#~ msgstr "" +#~ "如果你试图在显存小于24GB的GPU上运行像" +#~ "sd3-medium或FLUX.1系列这样的大型图像模型" +#~ ",你在启动或推理过程中可能会遇到显存" +#~ "溢出(OOM)的问题。尝试以下解决方案。" + +#~ msgid "For FLUX.1 series, try to apply quantization." +#~ msgstr "对于 FLUX.1 系列,尝试应用量化。" + +#~ msgid "For sd3-medium, apply quantization to ``text_encoder_3``." +#~ msgstr "对于 sd3-medium 模型,对 ``text_encoder_3`` 应用量化。" + +#~ msgid "Or removing memory-intensive T5-XXL text encoder for sd3-medium." +#~ msgstr "或者,移除 sd3-medium 模型中内存密集型的 T5-XXL 文本编码器。" + diff --git a/doc/source/models/builtin/audio/cosyvoice2-0.5b.rst b/doc/source/models/builtin/audio/cosyvoice2-0.5b.rst new file mode 100644 index 0000000000..781da78852 --- /dev/null +++ b/doc/source/models/builtin/audio/cosyvoice2-0.5b.rst @@ -0,0 +1,19 @@ +.. _models_builtin_cosyvoice2-0.5b: + +=============== +CosyVoice2-0.5B +=============== + +- **Model Name:** CosyVoice2-0.5B +- **Model Family:** CosyVoice +- **Abilities:** text-to-audio +- **Multilingual:** True + +Specifications +^^^^^^^^^^^^^^ + +- **Model ID:** mrfakename/CosyVoice2-0.5B + +Execute the following command to launch the model:: + + xinference launch --model-name CosyVoice2-0.5B --model-type audio \ No newline at end of file diff --git a/doc/source/models/builtin/audio/f5-tts-mlx.rst b/doc/source/models/builtin/audio/f5-tts-mlx.rst new file mode 100644 index 0000000000..7ef102eb19 --- /dev/null +++ b/doc/source/models/builtin/audio/f5-tts-mlx.rst @@ -0,0 +1,19 @@ +.. 
_models_builtin_f5-tts-mlx:
+
+==========
+F5-TTS-MLX
+==========
+
+- **Model Name:** F5-TTS-MLX
+- **Model Family:** F5-TTS-MLX
+- **Abilities:** text-to-audio
+- **Multilingual:** True
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** lucasnewman/f5-tts-mlx
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name F5-TTS-MLX --model-type audio
\ No newline at end of file
diff --git a/doc/source/models/builtin/audio/fishspeech-1.5.rst b/doc/source/models/builtin/audio/fishspeech-1.5.rst
new file mode 100644
index 0000000000..83b0b7d001
--- /dev/null
+++ b/doc/source/models/builtin/audio/fishspeech-1.5.rst
@@ -0,0 +1,19 @@
+.. _models_builtin_fishspeech-1.5:
+
+==============
+FishSpeech-1.5
+==============
+
+- **Model Name:** FishSpeech-1.5
+- **Model Family:** FishAudio
+- **Abilities:** text-to-audio
+- **Multilingual:** True
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** fishaudio/fish-speech-1.5
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name FishSpeech-1.5 --model-type audio
\ No newline at end of file
diff --git a/doc/source/models/builtin/audio/index.rst b/doc/source/models/builtin/audio/index.rst
index 0936a93c02..f01047e13c 100644
--- a/doc/source/models/builtin/audio/index.rst
+++ b/doc/source/models/builtin/audio/index.rst
@@ -25,9 +25,13 @@ The following is a list of built-in audio models in Xinference:
 
   cosyvoice-300m-sft
 
+  cosyvoice2-0.5b
+
   f5-tts
 
-  fishspeech-1.4
+  f5-tts-mlx
+
+  fishspeech-1.5
 
   sensevoicesmall
 
diff --git a/doc/source/models/builtin/image/flux.1-dev.rst b/doc/source/models/builtin/image/flux.1-dev.rst
index 3a16cfe0a7..5018412d86 100644
--- a/doc/source/models/builtin/image/flux.1-dev.rst
+++ b/doc/source/models/builtin/image/flux.1-dev.rst
@@ -13,7 +13,15 @@ Specifications
 ^^^^^^^^^^^^^^
 
 - **Model ID:** black-forest-labs/FLUX.1-dev
+- **GGUF Model ID:** city96/FLUX.1-dev-gguf
+- **GGUF Quantizations:** F16, Q2_K, Q3_K_S, Q4_0, Q4_1, Q4_K_S, Q5_0, Q5_1, Q5_K_S, Q6_K, Q8_0
+
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name FLUX.1-dev --model-type image
\ No newline at end of file
+   xinference launch --model-name FLUX.1-dev --model-type image
+
+
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name FLUX.1-dev --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
diff --git a/doc/source/models/builtin/image/flux.1-schnell.rst b/doc/source/models/builtin/image/flux.1-schnell.rst
index df82d2069f..47bc1e2411 100644
--- a/doc/source/models/builtin/image/flux.1-schnell.rst
+++ b/doc/source/models/builtin/image/flux.1-schnell.rst
@@ -13,7 +13,15 @@ Specifications
 ^^^^^^^^^^^^^^
 
 - **Model ID:** black-forest-labs/FLUX.1-schnell
+- **GGUF Model ID:** city96/FLUX.1-schnell-gguf
+- **GGUF Quantizations:** F16, Q2_K, Q3_K_S, Q4_0, Q4_1, Q4_K_S, Q5_0, Q5_1, Q5_K_S, Q6_K, Q8_0
+
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name FLUX.1-schnell --model-type image
\ No newline at end of file
+   xinference launch --model-name FLUX.1-schnell --model-type image
+
+
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name FLUX.1-schnell --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
diff --git a/doc/source/models/builtin/image/got-ocr2_0.rst b/doc/source/models/builtin/image/got-ocr2_0.rst
index 994b0deae4..b1e9b1d5d2 100644
--- a/doc/source/models/builtin/image/got-ocr2_0.rst
+++
b/doc/source/models/builtin/image/got-ocr2_0.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name GOT-OCR2_0 --model-type image
\ No newline at end of file
+   xinference launch --model-name GOT-OCR2_0 --model-type image
+
diff --git a/doc/source/models/builtin/image/index.rst b/doc/source/models/builtin/image/index.rst
index bf4efdab86..dacc0cbe38 100644
--- a/doc/source/models/builtin/image/index.rst
+++ b/doc/source/models/builtin/image/index.rst
@@ -23,6 +23,12 @@ The following is a list of built-in image models in Xinference:
 
   sd3-medium
 
+  sd3.5-large
+
+  sd3.5-large-turbo
+
+  sd3.5-medium
+
   sdxl-turbo
 
   stable-diffusion-2-inpainting
diff --git a/doc/source/models/builtin/image/kolors.rst b/doc/source/models/builtin/image/kolors.rst
index 19d11c4201..886de8ff02 100644
--- a/doc/source/models/builtin/image/kolors.rst
+++ b/doc/source/models/builtin/image/kolors.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name kolors --model-type image
\ No newline at end of file
+   xinference launch --model-name kolors --model-type image
+
diff --git a/doc/source/models/builtin/image/sd-turbo.rst b/doc/source/models/builtin/image/sd-turbo.rst
index e799b423a3..c78ebaef91 100644
--- a/doc/source/models/builtin/image/sd-turbo.rst
+++ b/doc/source/models/builtin/image/sd-turbo.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name sd-turbo --model-type image
\ No newline at end of file
+   xinference launch --model-name sd-turbo --model-type image
+
diff --git a/doc/source/models/builtin/image/sd3-medium.rst b/doc/source/models/builtin/image/sd3-medium.rst
index 953a3eca32..f0e728cc02 100644
--- a/doc/source/models/builtin/image/sd3-medium.rst
+++ b/doc/source/models/builtin/image/sd3-medium.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name sd3-medium --model-type image
\ No newline at end of file
+   xinference launch --model-name sd3-medium --model-type image
+
diff --git a/doc/source/models/builtin/image/sd3.5-large-turbo.rst b/doc/source/models/builtin/image/sd3.5-large-turbo.rst
new file mode 100644
index 0000000000..3c7e26c2b0
--- /dev/null
+++ b/doc/source/models/builtin/image/sd3.5-large-turbo.rst
@@ -0,0 +1,27 @@
+.. _models_builtin_sd3.5-large-turbo:
+
+=================
+sd3.5-large-turbo
+=================
+
+- **Model Name:** sd3.5-large-turbo
+- **Model Family:** stable_diffusion
+- **Abilities:** text2image, image2image, inpainting
+- **Available ControlNet:** None
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** stabilityai/stable-diffusion-3.5-large-turbo
+- **GGUF Model ID:** city96/stable-diffusion-3.5-large-turbo-gguf
+- **GGUF Quantizations:** F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0
+
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name sd3.5-large-turbo --model-type image
+
+
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name sd3.5-large-turbo --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
diff --git a/doc/source/models/builtin/image/sd3.5-large.rst b/doc/source/models/builtin/image/sd3.5-large.rst
new file mode 100644
index 0000000000..d998fc4908
--- /dev/null
+++ b/doc/source/models/builtin/image/sd3.5-large.rst
@@ -0,0 +1,27 @@
+..
_models_builtin_sd3.5-large:
+
+===========
+sd3.5-large
+===========
+
+- **Model Name:** sd3.5-large
+- **Model Family:** stable_diffusion
+- **Abilities:** text2image, image2image, inpainting
+- **Available ControlNet:** None
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** stabilityai/stable-diffusion-3.5-large
+- **GGUF Model ID:** city96/stable-diffusion-3.5-large-gguf
+- **GGUF Quantizations:** F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0
+
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name sd3.5-large --model-type image
+
+
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name sd3.5-large --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
diff --git a/doc/source/models/builtin/image/sd3.5-medium.rst b/doc/source/models/builtin/image/sd3.5-medium.rst
new file mode 100644
index 0000000000..42a65f7e47
--- /dev/null
+++ b/doc/source/models/builtin/image/sd3.5-medium.rst
@@ -0,0 +1,27 @@
+.. _models_builtin_sd3.5-medium:
+
+============
+sd3.5-medium
+============
+
+- **Model Name:** sd3.5-medium
+- **Model Family:** stable_diffusion
+- **Abilities:** text2image, image2image, inpainting
+- **Available ControlNet:** None
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** stabilityai/stable-diffusion-3.5-medium
+- **GGUF Model ID:** city96/stable-diffusion-3.5-medium-gguf
+- **GGUF Quantizations:** F16, Q3_K_M, Q3_K_S, Q4_0, Q4_1, Q4_K_M, Q4_K_S, Q5_0, Q5_1, Q5_K_M, Q5_K_S, Q6_K, Q8_0
+
+
+Execute the following command to launch the model::
+
+   xinference launch --model-name sd3.5-medium --model-type image
+
+
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name sd3.5-medium --model-type image --gguf_quantization ${gguf_quantization} --cpu_offload True
diff --git a/doc/source/models/builtin/image/sdxl-turbo.rst b/doc/source/models/builtin/image/sdxl-turbo.rst
index 878b10079f..9da2b63bf2 100644
--- a/doc/source/models/builtin/image/sdxl-turbo.rst
+++ b/doc/source/models/builtin/image/sdxl-turbo.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name sdxl-turbo --model-type image
\ No newline at end of file
+   xinference launch --model-name sdxl-turbo --model-type image
+
diff --git a/doc/source/models/builtin/image/stable-diffusion-2-inpainting.rst b/doc/source/models/builtin/image/stable-diffusion-2-inpainting.rst
index 6009cd37a3..01a84a63db 100644
--- a/doc/source/models/builtin/image/stable-diffusion-2-inpainting.rst
+++ b/doc/source/models/builtin/image/stable-diffusion-2-inpainting.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name stable-diffusion-2-inpainting --model-type image
\ No newline at end of file
+   xinference launch --model-name stable-diffusion-2-inpainting --model-type image
+
diff --git a/doc/source/models/builtin/image/stable-diffusion-inpainting.rst b/doc/source/models/builtin/image/stable-diffusion-inpainting.rst
index 76f1274048..3b4832471c 100644
--- a/doc/source/models/builtin/image/stable-diffusion-inpainting.rst
+++ b/doc/source/models/builtin/image/stable-diffusion-inpainting.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name stable-diffusion-inpainting --model-type image
\ No newline at end of file
+   xinference launch --model-name stable-diffusion-inpainting --model-type image
+
diff --git
a/doc/source/models/builtin/image/stable-diffusion-v1.5.rst b/doc/source/models/builtin/image/stable-diffusion-v1.5.rst
index 5a0c73adcd..764bbb0a4c 100644
--- a/doc/source/models/builtin/image/stable-diffusion-v1.5.rst
+++ b/doc/source/models/builtin/image/stable-diffusion-v1.5.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name stable-diffusion-v1.5 --model-type image
\ No newline at end of file
+   xinference launch --model-name stable-diffusion-v1.5 --model-type image
+
diff --git a/doc/source/models/builtin/image/stable-diffusion-xl-base-1.0.rst b/doc/source/models/builtin/image/stable-diffusion-xl-base-1.0.rst
index a4f7518dbf..cfb515be55 100644
--- a/doc/source/models/builtin/image/stable-diffusion-xl-base-1.0.rst
+++ b/doc/source/models/builtin/image/stable-diffusion-xl-base-1.0.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name stable-diffusion-xl-base-1.0 --model-type image
\ No newline at end of file
+   xinference launch --model-name stable-diffusion-xl-base-1.0 --model-type image
+
diff --git a/doc/source/models/builtin/image/stable-diffusion-xl-inpainting.rst b/doc/source/models/builtin/image/stable-diffusion-xl-inpainting.rst
index 61a72cc044..f096627d51 100644
--- a/doc/source/models/builtin/image/stable-diffusion-xl-inpainting.rst
+++ b/doc/source/models/builtin/image/stable-diffusion-xl-inpainting.rst
@@ -16,4 +16,5 @@ Specifications
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name stable-diffusion-xl-inpainting --model-type image
\ No newline at end of file
+   xinference launch --model-name stable-diffusion-xl-inpainting --model-type image
+
diff --git a/doc/source/models/model_abilities/image.rst b/doc/source/models/model_abilities/image.rst
index 79834e0dca..e49971a85b 100644
--- a/doc/source/models/model_abilities/image.rst
+++ b/doc/source/models/model_abilities/image.rst
@@ -1,4 +1,4 @@
-.. _image:
+.. _image:
 
 ======
 Images
 ======
@@ -41,6 +41,9 @@ The Text-to-image API is supported with the following models in Xinference:
 * stable-diffusion-v1.5
 * stable-diffusion-xl-base-1.0
 * sd3-medium
+* sd3.5-medium
+* sd3.5-large
+* sd3.5-large-turbo
 * FLUX.1-schnell
 * FLUX.1-dev
 
@@ -105,8 +108,15 @@ We can try Text-to-image API out either via cURL, OpenAI Client, or Xinference's
 }
 
 
-Tips for Large Image Models including SD3-Medium, FLUX.1
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Quantize Large Image Models e.g. SD3-Medium, FLUX.1
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. note::
+
+   From v0.16.1, Xinference enables quantization by default for
+   large image models like the Flux.1 and SD3.5 series.
+   So if your Xinference version is newer than v0.16.1,
+   you barely need to do anything to run those large image models on GPUs with limited memory.
 
 Useful extra parameters can be passed to launch including:
 
@@ -120,29 +130,77 @@ Useful extra parameters can be passed to launch including:
 * ``--text_encoder_3 None``, for sd3-medium, removing the memory-intensive 4.7B parameter T5-XXL
   text encoder during inference can significantly decrease the memory requirements
   with only a slight loss in performance.
+* ``--transformer_nf4 True``: use NF4 quantization for the transformer.
+* ``--quantize``: only works for MLX on Mac. Flux.1-dev and Flux.1-schnell will switch to
+  the MLX engine on Mac, and ``quantize`` can be used to quantize the model.
+
+For WebUI, just add additional parameters, e.g.
add key ``cpu_offload`` and value ``True``
+to enable CPU offloading.
+
+Below are the default options used from v0.16.1.
+
++-------------------+-----------------------+----------------------+------------------+
+| Model             | quantize_text_encoder | quantize             | transformer_nf4  |
++===================+=======================+======================+==================+
+| FLUX.1-dev        | text_encoder_2        | True                 | False            |
++-------------------+-----------------------+----------------------+------------------+
+| FLUX.1-schnell    | text_encoder_2        | True                 | False            |
++-------------------+-----------------------+----------------------+------------------+
+| sd3-medium        | text_encoder_3        | N/A                  | False            |
++-------------------+-----------------------+----------------------+------------------+
+| sd3.5-medium      | text_encoder_3        | N/A                  | False            |
++-------------------+-----------------------+----------------------+------------------+
+| sd3.5-large       | text_encoder_3        | N/A                  | True             |
++-------------------+-----------------------+----------------------+------------------+
+| sd3.5-large-turbo | text_encoder_3        | N/A                  | True             |
++-------------------+-----------------------+----------------------+------------------+
+
+.. note::
+
+    If you want to disable some quantization, just set the corresponding option to False,
+    e.g. for the Web UI, set key ``quantize_text_encoder`` and value ``False``,
+    and for the command line, specify ``--quantize_text_encoder False`` to disable quantization
+    for the text encoder.
+
+GGUF file format
+~~~~~~~~~~~~~~~~
+
+The GGUF file format provides various quantization options for the transformer.
+To use a GGUF file, you can specify the additional option ``gguf_quantization`` in the web UI,
+or ``--gguf_quantization`` on the command line, for those image models supported
+internally by Xinference. Below is the list of supported models.
+
++-------------------+------------------------------------------------------------------------------------------+
+| Model             | supported gguf quantization                                                              |
++===================+==========================================================================================+
+| FLUX.1-dev        | F16, Q2_K, Q3_K_S, Q4_0, Q4_1, Q4_K_S, Q5_0, Q5_1, Q5_K_S, Q6_K, Q8_0                    |
++-------------------+------------------------------------------------------------------------------------------+
+| FLUX.1-schnell    | F16, Q2_K, Q3_K_S, Q4_0, Q4_1, Q4_K_S, Q5_0, Q5_1, Q5_K_S, Q6_K, Q8_0                    |
++-------------------+------------------------------------------------------------------------------------------+
+| sd3.5-medium      | F16, Q3_K_M, Q3_K_S, Q4_0, Q4_1, Q4_K_M, Q4_K_S, Q5_0, Q5_1, Q5_K_M, Q5_K_S, Q6_K, Q8_0 |
++-------------------+------------------------------------------------------------------------------------------+
+| sd3.5-large       | F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0                                                        |
++-------------------+------------------------------------------------------------------------------------------+
+| sd3.5-large-turbo | F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0                                                        |
++-------------------+------------------------------------------------------------------------------------------+
+
+.. note::
+
+    We strongly recommend enabling the additional option ``cpu_offload`` with value ``True`` in the WebUI,
+    or specifying ``--cpu_offload True`` on the command line.
+
+Example:
+
+.. code-block:: bash
+
+    xinference launch --model-name FLUX.1-dev --model-type image --gguf_quantization Q2_K --cpu_offload True
+
+With ``Q2_K`` quantization, you only need around 5 GiB of GPU memory to run Flux.1-dev.
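+
+To sanity-check a launched model, Xinference's Python client can be used. The following
+is a minimal sketch; it assumes a local endpoint at ``http://127.0.0.1:9997`` and that
+``FLUX.1-dev`` is the model UID printed by ``xinference launch``:
+
+.. code-block:: python
+
+    from xinference.client import Client
+
+    client = Client("http://127.0.0.1:9997")
+    model = client.get_model("FLUX.1-dev")  # UID returned by `xinference launch`
+    # Mimics OpenAI's create-images API; the result contains a "data" list.
+    result = model.text_to_image("an astronaut riding a horse on the moon")
+    print(result["data"][0]["url"])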
+
+For models whose GGUF options are not supported internally, or if you want to download GGUF files
+on your own, you can specify the additional option ``gguf_model_path`` in the web UI or specify
+``--gguf_model_path /path/to/model_quant.gguf`` on the command line.
 
-If you are trying to run large image models liek sd3-medium or FLUX.1 series on GPU card
-that has less memory than 24GB, you may encounter OOM when launching or inference.
-Try below solutions.
-
-For FLUX.1 series, try to apply quantization.
-
-.. code:: bash
-
-    xinference launch --model-name FLUX.1-dev --model-type image --quantize_text_encoder text_encoder_2
-
-For sd3-medium, apply quantization to ``text_encoder_3``.
-
-.. code:: bash
-
-    xinference launch --model-name sd3-medium --model-type image --quantize_text_encoder text_encoder_3
-
-
-Or removing memory-intensive T5-XXL text encoder for sd3-medium.
-
-.. code:: bash
-
-    xinference launch --model-name sd3-medium --model-type image --text_encoder_3 None
 
 Image-to-image
 --------------------
diff --git a/doc/templates/image.rst.jinja b/doc/templates/image.rst.jinja
index 06379d7d56..8e1e57c145 100644
--- a/doc/templates/image.rst.jinja
+++ b/doc/templates/image.rst.jinja
@@ -13,7 +13,17 @@ Specifications
 ^^^^^^^^^^^^^^
 
 - **Model ID:** {{ model_id }}
+{%- if gguf_quantizations %}
+- **GGUF Model ID:** {{ gguf_model_id }}
+- **GGUF Quantizations:** {{ gguf_quantizations }}
+{% endif %}
 
 Execute the following command to launch the model::
 
-   xinference launch --model-name {{ model_name }} --model-type image
\ No newline at end of file
+   xinference launch --model-name {{ model_name }} --model-type image
+
+{% if gguf_quantizations %}
+For GGUF quantization, use the following command::
+
+   xinference launch --model-name {{ model_name }} --model-type image --gguf_quantization ${{ '{' }}gguf_quantization{{ '}' }} --cpu_offload True
+{% endif %}
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index f21d2431b4..7cafa63ce6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -90,7 +90,6 @@ all =
     tiktoken>=0.6.0
     sentence-transformers>=3.1.0
     vllm>=0.2.6 ; sys_platform=='linux'
-    diffusers>=0.30.0
     imageio-ffmpeg # For video
     controlnet_aux
     orjson
@@ -119,7 +118,8 @@ all =
     hydra-core>=1.3.2 # For CosyVoice, matcha
     inflect # For CosyVoice, matcha
     conformer # For CosyVoice, matcha
-    diffusers>=0.30.0 # For CosyVoice, matcha
+    diffusers>=0.32.0 # For CosyVoice, matcha
+    gguf # For image
     gdown # For CosyVoice, matcha
     pyarrow # For CosyVoice, matcha
     HyperPyYAML # For CosyVoice
@@ -193,9 +193,10 @@ embedding =
 rerank =
     FlagEmbedding
 image =
-    diffusers>=0.30.0 # fix conflict with matcha-tts
+    diffusers>=0.32.0 # fix conflict with matcha-tts
    controlnet_aux
     deepcache
+    gguf
     verovio>=4.3.1 # For got_ocr2
     transformers>=4.37.2 # For got_ocr2
     tiktoken>=0.6.0 # For got_ocr2
@@ -203,7 +204,7 @@ image =
     torch # For got_ocr2
     torchvision # For got_ocr2
 video =
-    diffusers>=0.30.0
+    diffusers>=0.32.0
     imageio-ffmpeg
 audio =
     funasr<1.1.17
@@ -220,7 +221,7 @@ audio =
     hydra-core>=1.3.2 # For CosyVoice, matcha
     inflect # For CosyVoice, matcha
     conformer # For CosyVoice, matcha
-    diffusers>=0.30.0 # For CosyVoice, matcha
+    diffusers>=0.32.0 # For CosyVoice, matcha
     gdown # For CosyVoice, matcha
     pyarrow # For CosyVoice, matcha
     HyperPyYAML # For CosyVoice
diff --git a/xinference/core/utils.py b/xinference/core/utils.py
index 278c570b20..6f9470107c 100644
--- a/xinference/core/utils.py
+++ b/xinference/core/utils.py
@@ -62,12 +62,16 @@ def decorator(func):
 
         @wraps(func)
         async def wrapped(*args, **kwargs):
-            try:
-                bound_args =
sig.bind_partial(*args, **kwargs) - arguments = bound_args.arguments - except TypeError: - arguments = {} - request_id_str = arguments.get("request_id", "") + request_id_str = kwargs.get("request_id") + if not request_id_str: + # sometimes `request_id` not in kwargs + # we try to bind the arguments + try: + bound_args = sig.bind_partial(*args, **kwargs) + arguments = bound_args.arguments + except TypeError: + arguments = {} + request_id_str = arguments.get("request_id", "") if not request_id_str: request_id_str = uuid.uuid1() if func_name == "text_to_image": diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt index 4c080714f9..b07688898b 100644 --- a/xinference/deploy/docker/requirements.txt +++ b/xinference/deploy/docker/requirements.txt @@ -34,7 +34,6 @@ protobuf einops tiktoken>=0.6.0 sentence-transformers>=3.1.0 -diffusers>=0.30.0 controlnet_aux orjson auto-gptq @@ -57,7 +56,7 @@ lightning>=2.0.0 # For CosyVoice, matcha hydra-core>=1.3.2 # For CosyVoice, matcha inflect # For CosyVoice, matcha conformer # For CosyVoice, matcha -diffusers>=0.30.0 # For CosyVoice, matcha +diffusers>=0.32.0 # For CosyVoice, matcha gdown # For CosyVoice, matcha pyarrow # For CosyVoice, matcha HyperPyYAML # For CosyVoice @@ -88,6 +87,7 @@ datamodel_code_generator # for minicpm-4B jsonschema # for minicpm-4B deepcache # for sd verovio>=4.3.1 # For got_ocr2 +gguf # sglang decord diff --git a/xinference/deploy/docker/requirements_cpu.txt b/xinference/deploy/docker/requirements_cpu.txt index 5465e7e1da..e89475bf70 100644 --- a/xinference/deploy/docker/requirements_cpu.txt +++ b/xinference/deploy/docker/requirements_cpu.txt @@ -31,7 +31,6 @@ einops tiktoken sentence-transformers>=3.1.0 FlagEmbedding -diffusers>=0.30.0 controlnet_aux orjson auto-gptq @@ -54,7 +53,7 @@ lightning>=2.0.0 # For CosyVoice, matcha hydra-core>=1.3.2 # For CosyVoice, matcha inflect # For CosyVoice, matcha conformer # For CosyVoice, matcha -diffusers>=0.30.0 # For CosyVoice, matcha +diffusers>=0.32.0 # For CosyVoice, matcha gdown # For CosyVoice, matcha pyarrow # For CosyVoice, matcha HyperPyYAML # For CosyVoice @@ -82,3 +81,4 @@ qwen-vl-utils # For qwen2-vl datamodel_code_generator # for minicpm-4B jsonschema # for minicpm-4B verovio>=4.3.1 # For got_ocr2 +gguf diff --git a/xinference/model/image/core.py b/xinference/model/image/core.py index 432a70c1a4..cfaa11070a 100644 --- a/xinference/model/image/core.py +++ b/xinference/model/image/core.py @@ -22,7 +22,12 @@ from ...constants import XINFERENCE_CACHE_DIR from ...types import PeftModelConfig from ..core import CacheableModelSpec, ModelDescription -from ..utils import valid_model_revision +from ..utils import ( + IS_NEW_HUGGINGFACE_HUB, + retry_download, + symlink_local_file, + valid_model_revision, +) from .ocr.got_ocr2 import GotOCR2Model from .stable_diffusion.core import DiffusionModel from .stable_diffusion.mlx import MLXDiffusionModel @@ -51,6 +56,9 @@ class ImageModelFamilyV1(CacheableModelSpec): controlnet: Optional[List["ImageModelFamilyV1"]] default_model_config: Optional[dict] = {} default_generate_config: Optional[dict] = {} + gguf_model_id: Optional[str] + gguf_quantizations: Optional[List[str]] + gguf_model_file_name_template: Optional[str] class ImageModelDescription(ModelDescription): @@ -187,6 +195,61 @@ def get_cache_status( return valid_model_revision(meta_path, model_spec.model_revision) +def cache_gguf(spec: ImageModelFamilyV1, quantization: Optional[str] = None): + if not quantization: + return + + cache_dir = 
os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, spec.model_name)) + if not os.path.exists(cache_dir): + os.makedirs(cache_dir, exist_ok=True) + + if not spec.gguf_model_file_name_template: + raise NotImplementedError( + f"{spec.model_name} does not support GGUF quantization" + ) + if quantization not in (spec.gguf_quantizations or []): + raise ValueError( + f"Cannot support quantization {quantization}, " + f"available quantizations: {spec.gguf_quantizations}" + ) + + filename = spec.gguf_model_file_name_template.format(quantization=quantization) # type: ignore + full_path = os.path.join(cache_dir, filename) + + if spec.model_hub == "huggingface": + import huggingface_hub + + use_symlinks = {} + if not IS_NEW_HUGGINGFACE_HUB: + use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir} + download_file_path = retry_download( + huggingface_hub.hf_hub_download, + spec.model_name, + None, + spec.gguf_model_id, + filename=filename, + **use_symlinks, + ) + if IS_NEW_HUGGINGFACE_HUB: + symlink_local_file(download_file_path, cache_dir, filename) + elif spec.model_hub == "modelscope": + from modelscope.hub.file_download import model_file_download + + download_file_path = retry_download( + model_file_download, + spec.model_name, + None, + spec.gguf_model_id, + filename, + revision=spec.model_revision, + ) + symlink_local_file(download_file_path, cache_dir, filename) + else: + raise NotImplementedError + + return full_path + + def create_ocr_model_instance( subpool_addr: str, devices: List[str], @@ -219,6 +282,8 @@ def create_image_model_instance( Literal["huggingface", "modelscope", "openmind_hub", "csghub"] ] = None, model_path: Optional[str] = None, + gguf_quantization: Optional[str] = None, + gguf_model_path: Optional[str] = None, **kwargs, ) -> Tuple[ Union[DiffusionModel, MLXDiffusionModel, GotOCR2Model], ImageModelDescription @@ -272,6 +337,8 @@ def create_image_model_instance( ] if not model_path: model_path = cache(model_spec) + if not gguf_model_path and gguf_quantization: + gguf_model_path = cache_gguf(model_spec, gguf_quantization) if peft_model_config is not None: lora_model = peft_model_config.peft_model lora_load_kwargs = peft_model_config.image_lora_load_kwargs @@ -298,6 +365,7 @@ def create_image_model_instance( lora_load_kwargs=lora_load_kwargs, lora_fuse_kwargs=lora_fuse_kwargs, model_spec=model_spec, + gguf_model_path=gguf_model_path, **kwargs, ) model_description = ImageModelDescription( diff --git a/xinference/model/image/model_spec.json b/xinference/model/image/model_spec.json index 24933cb99e..9d0b071a34 100644 --- a/xinference/model/image/model_spec.json +++ b/xinference/model/image/model_spec.json @@ -11,8 +11,24 @@ ], "default_model_config": { "quantize": true, - "quantize_text_encoder": "text_encoder_2" - } + "quantize_text_encoder": "text_encoder_2", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "city96/FLUX.1-schnell-gguf", + "gguf_quantizations": [ + "F16", + "Q2_K", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + "gguf_model_file_name_template": "flux1-schnell-{quantization}.gguf" }, { "model_name": "FLUX.1-dev", @@ -26,8 +42,24 @@ ], "default_model_config": { "quantize": true, - "quantize_text_encoder": "text_encoder_2" - } + "quantize_text_encoder": "text_encoder_2", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "city96/FLUX.1-dev-gguf", + "gguf_quantizations": [ + "F16", + "Q2_K", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + 
"gguf_model_file_name_template": "flux1-dev-{quantization}.gguf" }, { "model_name": "sd3-medium", @@ -44,6 +76,97 @@ "quantize_text_encoder": "text_encoder_3" } }, + { + "model_name": "sd3.5-medium", + "model_family": "stable_diffusion", + "model_id": "stabilityai/stable-diffusion-3.5-medium", + "model_revision": "94b13ccbe959c51e8159d91f562c58f29fac971a", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "city96/stable-diffusion-3.5-medium-gguf", + "gguf_quantizations": [ + "F16", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_M", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_medium-{quantization}.gguf" + }, + { + "model_name": "sd3.5-large", + "model_family": "stable_diffusion", + "model_id": "stabilityai/stable-diffusion-3.5-large", + "model_revision": "ceddf0a7fdf2064ea28e2213e3b84e4afa170a0f", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16", + "transformer_nf4": true + }, + "gguf_model_id": "city96/stable-diffusion-3.5-large-gguf", + "gguf_quantizations": [ + "F16", + "Q4_0", + "Q4_1", + "Q5_0", + "Q5_1", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_large-{quantization}.gguf" + }, + { + "model_name": "sd3.5-large-turbo", + "model_family": "stable_diffusion", + "model_id": "stabilityai/stable-diffusion-3.5-large-turbo", + "model_revision": "ec07796fc06b096cc56de9762974a28f4c632eda", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16", + "transformer_nf4": true + }, + "default_generate_config": { + "guidance_scale": 1.0, + "num_inference_steps": 4 + }, + "gguf_model_id": "city96/stable-diffusion-3.5-large-turbo-gguf", + "gguf_quantizations": [ + "F16", + "Q4_0", + "Q4_1", + "Q5_0", + "Q5_1", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf" + }, { "model_name": "sd-turbo", "model_family": "stable_diffusion", diff --git a/xinference/model/image/model_spec_modelscope.json b/xinference/model/image/model_spec_modelscope.json index ad8af7a26f..391749161c 100644 --- a/xinference/model/image/model_spec_modelscope.json +++ b/xinference/model/image/model_spec_modelscope.json @@ -12,8 +12,24 @@ ], "default_model_config": { "quantize": true, - "quantize_text_encoder": "text_encoder_2" - } + "quantize_text_encoder": "text_encoder_2", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "Xorbits/FLUX.1-schnell-gguf", + "gguf_quantizations": [ + "F16", + "Q2_K", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + "gguf_model_file_name_template": "flux1-schnell-{quantization}.gguf" }, { "model_name": "FLUX.1-dev", @@ -28,8 +44,24 @@ ], "default_model_config": { "quantize": true, - "quantize_text_encoder": "text_encoder_2" - } + "quantize_text_encoder": "text_encoder_2", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "AI-ModelScope/FLUX.1-dev-gguf", + "gguf_quantizations": [ + "F16", + "Q2_K", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + "gguf_model_file_name_template": "flux1-dev-{quantization}.gguf" }, { 
"model_name": "sd3-medium", @@ -47,6 +79,100 @@ "quantize_text_encoder": "text_encoder_3" } }, + { + "model_name": "sd3.5-medium", + "model_family": "stable_diffusion", + "model_hub": "modelscope", + "model_id": "AI-ModelScope/stable-diffusion-3.5-medium", + "model_revision": "master", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16" + }, + "gguf_model_id": "Xorbits/stable-diffusion-3.5-medium-gguf", + "gguf_quantizations": [ + "F16", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_M", + "Q4_K_S", + "Q5_0", + "Q5_1", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_medium-{quantization}.gguf" + }, + { + "model_name": "sd3.5-large", + "model_family": "stable_diffusion", + "model_hub": "modelscope", + "model_id": "AI-ModelScope/stable-diffusion-3.5-large", + "model_revision": "master", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16", + "transformer_nf4": true + }, + "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-gguf", + "gguf_quantizations": [ + "F16", + "Q4_0", + "Q4_1", + "Q5_0", + "Q5_1", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_large-{quantization}.gguf" + }, + { + "model_name": "sd3.5-large-turbo", + "model_family": "stable_diffusion", + "model_hub": "modelscope", + "model_id": "AI-ModelScope/stable-diffusion-3.5-large-turbo", + "model_revision": "master", + "model_ability": [ + "text2image", + "image2image", + "inpainting" + ], + "default_model_config": { + "quantize": true, + "quantize_text_encoder": "text_encoder_3", + "torch_dtype": "bfloat16", + "transformer_nf4": true + }, + "default_generate_config": { + "guidance_scale": 1.0, + "num_inference_steps": 4 + }, + "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-turbo-gguf", + "gguf_quantizations": [ + "F16", + "Q4_0", + "Q4_1", + "Q5_0", + "Q5_1", + "Q8_0" + ], + "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf" + }, { "model_name": "sd-turbo", "model_family": "stable_diffusion", diff --git a/xinference/model/image/stable_diffusion/core.py b/xinference/model/image/stable_diffusion/core.py index e0f7e5c886..3330fd9395 100644 --- a/xinference/model/image/stable_diffusion/core.py +++ b/xinference/model/image/stable_diffusion/core.py @@ -14,8 +14,10 @@ import contextlib import gc +import importlib import inspect import itertools +import json import logging import os import re @@ -86,6 +88,7 @@ def __init__( lora_load_kwargs: Optional[Dict] = None, lora_fuse_kwargs: Optional[Dict] = None, model_spec: Optional["ImageModelFamilyV1"] = None, + gguf_model_path: Optional[str] = None, **kwargs, ): self._model_uid = model_uid @@ -109,6 +112,8 @@ def __init__( self._model_spec = model_spec self._abilities = model_spec.model_ability or [] # type: ignore self._kwargs = kwargs + # gguf + self._gguf_model_path = gguf_model_path @property def model_ability(self): @@ -184,7 +189,17 @@ def _apply_lora(self): self._model.fuse_lora(**self._lora_fuse_kwargs) logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.") + def _get_layer_cls(self, layer: str): + with open(os.path.join(self._model_path, "model_index.json")) as f: # type: ignore + model_index = json.load(f) + layer_info = model_index[layer] + module_name, class_name = layer_info + module = 
importlib.import_module(module_name) + return getattr(module, class_name) + def load(self): + from transformers import BitsAndBytesConfig, T5EncoderModel + if "text2image" in self._abilities or "image2image" in self._abilities: from diffusers import AutoPipelineForText2Image as AutoPipelineModel elif "inpainting" in self._abilities: @@ -200,7 +215,9 @@ def load(self): glob(os.path.join(self._model_path, "*/*.safetensors")) ) if isinstance(torch_dtype, str): - self._kwargs["torch_dtype"] = getattr(torch, torch_dtype) + self._torch_dtype = torch_dtype = self._kwargs["torch_dtype"] = getattr( + torch, torch_dtype + ) controlnet = self._kwargs.get("controlnet") if controlnet is not None: @@ -212,18 +229,7 @@ def load(self): ] quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None) - if quantize_text_encoder: - try: - from transformers import BitsAndBytesConfig, T5EncoderModel - except ImportError: - error_message = "Failed to import module 'transformers'" - installation_guide = [ - "Please make sure 'transformers' is installed. ", - "You can install it by `pip install transformers`\n", - ] - - raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") - + if quantize_text_encoder and not self._gguf_model_path: try: import bitsandbytes # noqa: F401 except ImportError: @@ -249,6 +255,32 @@ def load(self): self._kwargs[text_encoder_name] = text_encoder self._kwargs["device_map"] = "balanced" + if self._gguf_model_path: + from diffusers import GGUFQuantizationConfig + + # GGUF transformer + self._kwargs["transformer"] = self._get_layer_cls( + "transformer" + ).from_single_file( + self._gguf_model_path, + quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype), + torch_dtype=torch_dtype, + config=os.path.join(self._model_path, "transformer"), + ) + elif self._kwargs.get("transformer_nf4"): + nf4_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch_dtype, + ) + model_nf4 = self._get_layer_cls("transformer").from_pretrained( + self._model_path, + subfolder="transformer", + quantization_config=nf4_config, + torch_dtype=torch_dtype, + ) + self._kwargs["transformer"] = model_nf4 + logger.debug( "Loading model from %s, kwargs: %s", self._model_path, self._kwargs )
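
For context, the GGUF transformer loading added in ``stable_diffusion/core.py`` above mirrors
the standalone ``diffusers`` API. A minimal sketch, assuming ``diffusers>=0.32.0`` (which
provides ``GGUFQuantizationConfig``) and a locally downloaded ``flux1-dev-Q4_0.gguf``; the file
name and prompt are illustrative::

    import torch
    from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

    # Load only the transformer from the GGUF file; the remaining
    # pipeline components are loaded in bfloat16 as usual.
    transformer = FluxTransformer2DModel.from_single_file(
        "flux1-dev-Q4_0.gguf",
        quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
        torch_dtype=torch.bfloat16,
    )
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev",
        transformer=transformer,
        torch_dtype=torch.bfloat16,
    )
    pipe.enable_model_cpu_offload()  # counterpart of `--cpu_offload True`
    image = pipe("a cup of coffee on a desk", num_inference_steps=28).images[0]
    image.save("out.png")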