Add more explanation about the context parallel (#260)

2025-06-03 22:04:53 +00:00 · 2025-03-20 17:04:49 +08:00 · 2025-03-20 17:04:49 +08:00 · 9fb8845009
commit 9fb8845009
parent 0f816057da
2 changed files with 19 additions and 6 deletions
--- a/README.md
+++ b/README.md
@ -147,6 +147,19 @@ python generate.py  --task t2v-1.3B --size 832*480 --ckpt_dir ./Wan2.1-T2V-1.3B

 - Multi-GPU inference using FSDP + xDiT USP

+  We use FSDP and [xDiT](https://github.com/xdit-project/xDiT) USP to accelerate inference.
+
+  * Ulysses Strategy
+
+    If you want to use [`Ulysses`](https://arxiv.org/abs/2309.14509) strategy, you should set `--ulysses_size $GPU_NUMS`. Note that the `num_heads` should be divisible by `ulysses_size` if you wish to use `Ulysses` strategy. For the 1.3B model, the `num_heads` is `12`, which is not divisible by 8 (as most multi-GPU machines have 8 GPUs). Therefore, it is recommended to use `Ring Strategy` instead.
+
+  * Ring Strategy
+
+    If you want to use [`Ring`](https://arxiv.org/pdf/2310.01889) strategy, you should set `--ring_size $GPU_NUMS`. Note that the `sequence length` should be divisible by `ring_size` when using the `Ring` strategy.
+
+  Of course, you can also combine the use of `Ulysses` and `Ring` strategies.
+
+
 ``` sh
 pip install "xfuser>=0.4.1"
 torchrun --nproc_per_node=8 generate.py --task t2v-14B --size 1280*720 --ckpt_dir ./Wan2.1-T2V-14B --dit_fsdp --t5_fsdp --ulysses_size 8 --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
@ -324,11 +337,11 @@ prompt = (
 negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

 output = pipe(
-    image=image, 
-    prompt=prompt, 
-    negative_prompt=negative_prompt, 
-    height=height, width=width, 
-    num_frames=81, 
+    image=image,
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=height, width=width,
+    num_frames=81,
    guidance_scale=5.0
 ).frames[0]
 export_to_video(output, "output.mp4", fps=16)
--- a/generate.py
+++ b/generate.py
@ -260,7 +260,7 @@ def generate(args):

    cfg = WAN_CONFIGS[args.task]
    if args.ulysses_size > 1:
-        assert cfg.num_heads % args.ulysses_size == 0, f"`num_heads` must be divisible by `ulysses_size`."
+        assert cfg.num_heads % args.ulysses_size == 0, f"`{cfg.num_heads=}` cannot be divided evenly by `{args.ulysses_size=}`."

    logging.info(f"Generation job args: {args}")
    logging.info(f"Generation model config: {cfg}")