From b8b6781c0e1c546bb046273de2147c507727fb46 Mon Sep 17 00:00:00 2001
From: Li Xiang <54010254+lixiang007666@users.noreply.github.com>
Date: Sun, 7 Jul 2024 00:08:49 +0800
Subject: [PATCH] Refine pixart readme for sigma support (#986)

This PR is done:
- [x] Refine pixart readme for sigma.

---------

Co-authored-by: Xiaoyu Xu
---
 benchmarks/text_to_image.py                   | 11 ++-
 .../{pixart_alpha => pixart}/README.md        | 76 ++++++++++++++-----
 .../infer_compiler/backends/nexfort/README.md |  3 +-
 3 files changed, 69 insertions(+), 21 deletions(-)
 rename onediff_diffusers_extensions/examples/{pixart_alpha => pixart}/README.md (66%)

diff --git a/benchmarks/text_to_image.py b/benchmarks/text_to_image.py
index a0f565770..85b47c68e 100644
--- a/benchmarks/text_to_image.py
+++ b/benchmarks/text_to_image.py
@@ -36,6 +36,7 @@
 from diffusers.utils import load_image

 from onediffx import compile_pipe, quantize_pipe # quantize_pipe currently only supports the nexfort backend.
+from onediff.infer_compiler import oneflow_compile


 def parse_args():
@@ -244,7 +245,13 @@ def main():
         pass
     elif args.compiler == "oneflow":
         print("Oneflow backend is now active...")
-        pipe = compile_pipe(pipe)
+        # Note: The compile_pipe() based on the oneflow backend is incompatible with T5EncoderModel.
+        # pipe = compile_pipe(pipe)
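+        # Compile the denoiser (UNet or DiT transformer) and the VAE decoder individually,
+        # so the T5 text encoder is left uncompiled and keeps running in eager mode.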
+        if hasattr(pipe, "unet"):
+            pipe.unet = oneflow_compile(pipe.unet)
+        if hasattr(pipe, "transformer"):
+            pipe.transformer = oneflow_compile(pipe.transformer)
+        pipe.vae.decoder = oneflow_compile(pipe.vae.decoder)
     elif args.compiler == "nexfort":
         print("Nexfort backend is now active...")
         if args.quantize:
@@ -267,7 +274,7 @@ def main():
                 options = json.loads(args.compiler_config)
             else:
                 # config with string
-                options = '{"mode": "max-optimize:max-autotune:freezing", "memory_format": "channels_last"}'
+                options = '{"mode": "max-optimize:max-autotune:low-precision", "memory_format": "channels_last"}'
             pipe = compile_pipe(
                 pipe, backend="nexfort", options=options, fuse_qkv_projections=True
             )
diff --git a/onediff_diffusers_extensions/examples/pixart_alpha/README.md b/onediff_diffusers_extensions/examples/pixart/README.md
similarity index 66%
rename from onediff_diffusers_extensions/examples/pixart_alpha/README.md
rename to onediff_diffusers_extensions/examples/pixart/README.md
index 30f13f4d2..25e61f1d0 100644
--- a/onediff_diffusers_extensions/examples/pixart_alpha/README.md
+++ b/onediff_diffusers_extensions/examples/pixart/README.md
@@ -1,10 +1,10 @@
-# Run PixArt alpha with nexfort backend(Beta Release)
+# Run PixArt with nexfort backend (Beta Release)

 1. [Environment Setup](#environment-setup)
    - [Set up onediff](#set-up-onediff)
    - [Set up nexfort backend](#set-up-nexfort-backend)
-   - [Set up PixArt alpha](#set-up-pixart-alpha)
+   - [Set up PixArt](#set-up-pixart)
 2. [Run](#run)
    - [Run 1024*1024 without compile](#run-10241024-without-compile)
    - [Run 1024*1024 with compile](#run-10241024-with-compile)
@@ -22,21 +22,31 @@ https://github.com/siliconflow/onediff?tab=readme-ov-file#installation

 ### Set up nexfort backend
 https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort

-### Set up PixArt alpha
-HF model: https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS
+### Set up PixArt
+
+
+HF model:
+
+ - PixArt-alpha: https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS
+ - PixArt-sigma: https://huggingface.co/PixArt-alpha/PixArt-Sigma-XL-2-1024-MS

 HF pipeline: https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart

 ## Run
+
 model_id_or_path_to_PixArt-XL-2-1024-MS is the model id or model path of pixart alpha, such as `PixArt-alpha/PixArt-XL-2-1024-MS` or `/data/hf_models/PixArt-XL-2-1024-MS/`

+> [!NOTE]
+Compared to PixArt-alpha, PixArt-sigma extends the token length of the text encoder and introduces a new attention module that compresses key and value tokens, while keeping the overall model architecture consistent. The nexfort backend of onediff supports compilation acceleration for both versions of PixArt.
+
 ### Go to the onediff folder
 ```
 cd onediff
 ```

-### Run 1024*1024 without compile(the original pytorch HF diffusers pipeline)
+### Run 1024*1024 without compile (the original pytorch HF diffusers pipeline)
 ```
+# To test sigma, specify the --model parameter as `PixArt-alpha/PixArt-Sigma-XL-2-1024-MS`.
 python3 ./benchmarks/text_to_image.py \
 --model PixArt-alpha/PixArt-XL-2-1024-MS \
 --scheduler none \
 --steps 20 \
 --compiler none \
 --output-image ./pixart_alpha.png \
 --prompt "product photography, world of warcraft orc warrior, white background"
 ```

-### Run 1024*1024 with compile
+### Run 1024*1024 with oneflow backend compile
+
+```
+python3 ./benchmarks/text_to_image.py \
+--model PixArt-alpha/PixArt-XL-2-1024-MS \
+--scheduler none \
+--steps 20 \
+--compiler oneflow \
+--output-image ./pixart_alpha_compile.png \
+--prompt "product photography, world of warcraft orc warrior, white background"
+```
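+
+If you prefer to call the compiler from Python rather than through the benchmark script, the following minimal sketch mirrors what `benchmarks/text_to_image.py` does with `--compiler oneflow`; the `PixArtAlphaPipeline` class, fp16 dtype, and file names are illustrative assumptions, so adjust them to your setup:
+
+```python
+import torch
+from diffusers import PixArtAlphaPipeline
+from onediff.infer_compiler import oneflow_compile
+
+pipe = PixArtAlphaPipeline.from_pretrained(
+    "PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16
+).to("cuda")
+
+# Compile the DiT transformer and the VAE decoder individually;
+# the T5 text encoder is intentionally left uncompiled.
+pipe.transformer = oneflow_compile(pipe.transformer)
+pipe.vae.decoder = oneflow_compile(pipe.vae.decoder)
+
+image = pipe(
+    "product photography, world of warcraft orc warrior, white background",
+    num_inference_steps=20,
+).images[0]
+image.save("pixart_alpha_compile.png")
+```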
+
+### Run 1024*1024 with nexfort backend compile
 ```
 python3 ./benchmarks/text_to_image.py \
 --model PixArt-alpha/PixArt-XL-2-1024-MS \
 --scheduler none \
 --steps 20 \
 --compiler nexfort \
 --output-image ./pixart_alpha_compile.png \
 --prompt "product photography, world of warcraft orc warrior, white background"
 ```

 ### Metric

+#### On 4090
+| Metric | NVIDIA GeForce RTX 4090 (1024 * 1024) |
+| ------------------------------------------------ | ----------------------------------- |
+| Data update date(yyyy-mm-dd) | 2024-07-06 |
+| PyTorch iteration speed | 7.591it/s |
+| OneDiff iteration speed | 14.308it/s(+88.5%) |
+| PyTorch E2E time | 2.881s |
+| OneDiff E2E time | 1.509s(-47.6%) |
+| PyTorch Max Mem Used | 14.447GiB |
+| OneDiff Max Mem Used | 13.571GiB |
+| PyTorch Warmup with Run time | 3.314s |
+| OneDiff Warmup with Compilation time<sup>1</sup> | 244.500s |
+| OneDiff Warmup with Cache time | 80.866s |
+
+ <sup>1</sup> OneDiff warmup with compilation time is tested on AMD EPYC 7543 32-Core Processor. Note this is just for reference, and it varies a lot across different CPUs.
+
 #### On A100
 | Metric | NVIDIA A100-PCIE-40GB (1024 * 1024) |
 | ------------------------------------------------ | ----------------------------------- |
 | Data update date(yyyy-mm-dd) | 2024-05-23 |
 | PyTorch iteration speed | 8.623it/s |
-| OneDiff iteration speed | 10.743it/s(+24.6%) |
+| OneDiff iteration speed | 10.743it/s(+24.6%) |
 | PyTorch E2E time | 2.568s |
 | OneDiff E2E time | 1.992s(-22.4%) |
 | PyTorch Max Mem Used | 14.445GiB |
 | OneDiff Max Mem Used | 13.855GiB |
 | PyTorch Warmup with Run time | 4.100s |
-| OneDiff Warmup with Compilation time<sup>1</sup> | 510.170s |
+| OneDiff Warmup with Compilation time<sup>2</sup> | 510.170s |
 | OneDiff Warmup with Cache time | 111.563s |

- <sup>1</sup> OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz. Note this is just for reference, and it varies a lot on different CPU.
+ <sup>2</sup> Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz.

 #### On H800
-| Metric | NVIDIA H800 (1024 * 1024) |
+| Metric | NVIDIA H800-NVL-80GB (1024 * 1024) |
 | ------------------------------------------------ | ----------------------------------- |
 | Data update date(yyyy-mm-dd) | 2024-05-29 |
 | PyTorch iteration speed | 21.067it/s |
 | OneDiff iteration speed | 23.342it/s(+10.8%) |
 | PyTorch E2E time | 1.077s |
 | OneDiff E2E time | 0.990s(-8.1%) |
 | PyTorch Max Mem Used | 14.468GiB |
 | OneDiff Max Mem Used | 13.970GiB |
 | PyTorch Warmup with Run time | 1.741s |
-| OneDiff Warmup with Compilation time<sup>2</sup> | 718.539s |
+| OneDiff Warmup with Compilation time<sup>3</sup> | 718.539s |
 | OneDiff Warmup with Cache time | 131.776s |

- <sup>2</sup> Intel(R) Xeon(R) Platinum 8468.
+ <sup>3</sup> Intel(R) Xeon(R) Platinum 8468.
+
+#### The nexfort backend compile config and warmup cost
-#### nexfort compile config and warmup cost
 - compiler-config
-  - default is `{"mode": "max-optimize:max-autotune:freezing", "memory_format": "channels_last"}` in `/benchmarks/text_to_image.py`, the compilation time is about 500 seconds
-  - setting `--compiler-config '{"mode": "max-autotune", "memory_format": "channels_last"}'` will reduce compilation time to about 60 seconds and just slightly reduce the performance
-  - setting `--compiler-config '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs", "memory_format": "channels_last"}'` will help to make the best performance but the compilation time is about 700 seconds
-  - setting `--compiler-config '{"mode": "jit:disable-runtime-fusion", "memory_format": "channels_last"}'` will reduce compilation time to 20 seconds, but will reduce the performance
+  - default is `{"mode": "max-optimize:max-autotune:low-precision", "memory_format": "channels_last"}` in `/benchmarks/text_to_image.py`. This mode supports dynamic shapes.
+  - setting `--compiler-config '{"mode": "max-autotune", "memory_format": "channels_last"}'` will reduce compilation time and just slightly reduce the performance.
+  - setting `--compiler-config '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs", "memory_format": "channels_last"}'` will help achieve the best performance, but it increases the compilation time and affects stability.
+  - setting `--compiler-config '{"mode": "jit:disable-runtime-fusion", "memory_format": "channels_last"}'` will reduce compilation time to 20 seconds, but will reduce the performance.
 - fuse_qkv_projections: True
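+
+In Python, these options map directly onto `onediffx.compile_pipe`; a minimal sketch of the nexfort path used by `benchmarks/text_to_image.py` (the `PixArtAlphaPipeline` setup, fp16 dtype, and output file name are illustrative assumptions):
+
+```python
+import torch
+from diffusers import PixArtAlphaPipeline
+from onediffx import compile_pipe
+
+pipe = PixArtAlphaPipeline.from_pretrained(
+    "PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16
+).to("cuda")
+
+# Same JSON options string as the --compiler-config default listed above.
+options = '{"mode": "max-optimize:max-autotune:low-precision", "memory_format": "channels_last"}'
+pipe = compile_pipe(pipe, backend="nexfort", options=options, fuse_qkv_projections=True)
+
+# The first call triggers compilation (the warmup cost shown above); repeated calls reuse it.
+image = pipe(
+    "product photography, world of warcraft orc warrior, white background",
+    num_inference_steps=20,
+).images[0]
+image.save("pixart_compile.png")
+```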

 ## Quantization

+> [!NOTE]
+Quantization is a feature of OneDiff Enterprise.
+
 Onediff's nexfort backend works closely with Torchao to support model quantization. Quant can reduce the runtime memory requirement and increase the inference speed.

 ### Run
diff --git a/src/onediff/infer_compiler/backends/nexfort/README.md b/src/onediff/infer_compiler/backends/nexfort/README.md
index 93231bff1..d52deab97 100644
--- a/src/onediff/infer_compiler/backends/nexfort/README.md
+++ b/src/onediff/infer_compiler/backends/nexfort/README.md
@@ -42,8 +42,7 @@ python3 -m nexfort.utils.clear_inductor_cache
 Advanced cache functionality is currently in development.

 ### Dynamic shape
-Onediff's nexfort backend also supports out-of-the-box dynamic shape inference. You just need to enable `dynamic` during compilation, as in `'{"mode": "max-autotune
-", "dynamic": true}'`. To understand how dynamic shape support works, please refer to the and page. To avoid over-specialization and re-compilation, you need to initially call your model with a non-typical shape. For example: you can first call your Stable Diffusion model with a shape of 512x768 (height != width).
+Onediff's nexfort backend also supports out-of-the-box dynamic shape inference. You just need to enable `dynamic` during compilation, as in `'{"mode": "max-autotune", "dynamic": true}'`. To understand how dynamic shape support works, please refer to the and page. To avoid over-specialization and re-compilation, you need to initially call your model with a non-typical shape. For example: you can first call your Stable Diffusion model with a shape of 512x768 (height != width).

 Test SDXL:
 ```
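The dynamic-shape usage described above can be sketched for a PixArt pipeline as well; the `dynamic` compile option is the one quoted in the README text, while the pipeline class, fp16 dtype, and resolutions below are illustrative assumptions:

```python
import torch
from diffusers import PixArtAlphaPipeline
from onediffx import compile_pipe

pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16
).to("cuda")

# Enable dynamic shapes so that changing the resolution later does not retrigger a full compilation.
pipe = compile_pipe(pipe, backend="nexfort", options='{"mode": "max-autotune", "dynamic": true}')

prompt = "product photography, world of warcraft orc warrior, white background"
# Warm up with a non-typical, non-square shape first (height != width) to avoid over-specialization ...
pipe(prompt, height=512, width=768, num_inference_steps=20)
# ... then other resolutions reuse the already-compiled artifacts.
image = pipe(prompt, height=1024, width=1024, num_inference_steps=20).images[0]
image.save("pixart_dynamic.png")
```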