Mirror of https://github.com/Wan-Video/Wan2.1.git (synced 2025-11-04 14:16:57 +00:00)

commit ae5de88cbf (parent 33514a89cb)
added fast wan 5B and fixed pytorch compilation
@@ -20,6 +20,10 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models
 
 **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
 
 ## 🔥 Latest Updates :
 
+### August 8 2025: WanGP v7.75 - Faster than the VAE ...
+
+We have a funny one here today: FastWan 2.2 5B, the fastest video generator, only 20s to generate 121 frames at 720p. The snag is that the VAE is twice as slow...
+
+Thanks to Kijai for extracting the Lora used to build the corresponding finetune.
+
 ### August 8 2025: WanGP v7.74 - Qwen Rebirth part 2
 
 Added support for the Qwen Lightning Lora for 8-step generation (https://huggingface.co/lightx2v/Qwen-Image-Lightning/blob/main/Qwen-Image-Lightning-8steps-V1.0.safetensors). This Lora is not normalized, so use a multiplier around 0.1.
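Since the release note above is the only documentation of that multiplier, here is a minimal sketch of what applying it means, assuming the standard LoRA merge rule; the function and shapes are illustrative, not WanGP's actual loader:

import torch

def apply_lora(weight, lora_A, lora_B, multiplier=0.1):
    # Standard LoRA merge: W' = W + multiplier * (B @ A).
    # An unnormalized lora such as Qwen-Image-Lightning ships without
    # the usual alpha/rank scaling, hence the small manual multiplier.
    return weight + multiplier * (lora_B @ lora_A)

W = torch.zeros(1024, 1024)   # a projection weight
A = torch.randn(16, 1024)     # lora down, rank 16
B = torch.randn(1024, 16)     # lora up
W_merged = apply_lora(W, A, B, multiplier=0.1)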
configs/i2v_2_2_multitalk.json (new file, 15 lines)
@@ -0,0 +1,15 @@
+{
+    "_class_name": "WanModel",
+    "_diffusers_version": "0.33.0",
+    "dim": 5120,
+    "eps": 1e-06,
+    "ffn_dim": 13824,
+    "freq_dim": 256,
+    "in_dim": 36,
+    "model_type": "i2v2_2",
+    "num_heads": 40,
+    "num_layers": 40,
+    "out_dim": 16,
+    "text_len": 512,
+    "multitalk_output_dim": 768
+}
defaults/i2v_2_2_multitalk.json (new file, 18 lines)
@@ -0,0 +1,18 @@
+{
+    "model":
+    {
+        "name": "Wan2.2 Multitalk 14B",
+        "architecture" : "i2v_2_2_multitalk",
+        "description": "The Multitalk module of Wan 2.1 has been combined with the Wan 2.2 image 2 video. It lets up to two people hold a conversation.",
+        "modules": ["multitalk"],
+        "URLs": "i2v_2_2",
+        "URLs2": "i2v_2_2",
+        "group": "wan2_2",
+        "visible": false
+    },
+    "switch_threshold" : 900,
+    "guidance_scale" : 3.5,
+    "guidance2_scale" : 3.5,
+    "flow_shift" : 5
+
+}
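Wan 2.2 14B checkpoints ship as two experts, a high-noise and a low-noise transformer, which is presumably why the finetune lists both URLs and URLs2 along with a switch_threshold. A sketch of how such a threshold would select the expert, assuming timesteps counting down from 1000; the helper below is hypothetical, not WanGP code:

def pick_expert(timestep, switch_threshold=900):
    # Denoising runs from t=1000 (pure noise) down to t=0, so the
    # high-noise expert (URLs) handles the early steps and the
    # low-noise expert (URLs2) takes over below the threshold.
    return "high_noise" if timestep >= switch_threshold else "low_noise"

assert pick_expert(981) == "high_noise"
assert pick_expert(640) == "low_noise"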
|
||||
15
defaults/ti2v_2_2_fastwan.json
Normal file
15
defaults/ti2v_2_2_fastwan.json
Normal file
@ -0,0 +1,15 @@
|
||||
{
|
||||
"model": {
|
||||
"name": "Wan2.2 FastWan TextImage2video 5B",
|
||||
"architecture": "ti2v_2_2",
|
||||
"description": "FastWan2.2-TI2V-5B-Full-Diffusers is built upon Wan-AI/Wan2.2-TI2V-5B-Diffusers. It supports efficient 3-step inference and produces high-quality videos at 121×704×1280 resolution",
|
||||
"URLs": "ti2v_2_2",
|
||||
"loras": ["https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan2_2_5B_FastWanFullAttn_lora_rank_128_bf16.safetensors"],
|
||||
"group": "wan2_2"
|
||||
},
|
||||
"video_length": 121,
|
||||
"guidance_scale": 1,
|
||||
"flow_shift": 3,
|
||||
"num_inference_steps": 3,
|
||||
"resolution": "1280x720"
|
||||
}
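num_inference_steps of 3 with guidance_scale 1 means three model calls in total, with no extra CFG pass, and flow_shift bends the sigma schedule toward the high-noise end. A toy sketch of such a 3-step shifted flow-matching sampler, using the shift formula common to flow models; WanGP's actual sampler is not shown in this diff:

import torch

def shifted_sigmas(num_steps=3, shift=3.0):
    # Linear sigmas 1 -> 0, then the usual flow-matching shift:
    # sigma' = shift * sigma / (1 + (shift - 1) * sigma)
    sigmas = torch.linspace(1.0, 0.0, num_steps + 1)
    return shift * sigmas / (1 + (shift - 1) * sigmas)

def sample(model, latents):
    # Plain Euler integration of the predicted velocity, one model
    # call per step and no CFG pass since guidance_scale is 1.
    sigmas = shifted_sigmas()
    for sigma, sigma_next in zip(sigmas[:-1], sigmas[1:]):
        v = model(latents, sigma)                  # velocity prediction
        latents = latents + (sigma_next - sigma) * v
    return latents

# Usage with a dummy velocity model:
out = sample(lambda x, s: -x, torch.randn(1, 16, 21, 90, 160))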
@@ -32,7 +32,7 @@ class family_handler():
         return {}, {}
 
     @staticmethod
-    def get_rgb_factors(model_type):
+    def get_rgb_factors(base_model_type ):
         from shared.RGB_factors import get_rgb_factors
         latent_rgb_factors, latent_rgb_factors_bias = get_rgb_factors("flux")
         return latent_rgb_factors, latent_rgb_factors_bias
@@ -72,7 +72,7 @@ class family_handler():
         return {"hunyuan":(20, "Hunyuan Video")}
 
     @staticmethod
-    def get_rgb_factors(model_type):
+    def get_rgb_factors(base_model_type ):
         from shared.RGB_factors import get_rgb_factors
         latent_rgb_factors, latent_rgb_factors_bias = get_rgb_factors("hunyuan")
         return latent_rgb_factors, latent_rgb_factors_bias
@@ -38,7 +38,7 @@ class family_handler():
         return {}, {}
 
     @staticmethod
-    def get_rgb_factors(model_type):
+    def get_rgb_factors(base_model_type ):
         from shared.RGB_factors import get_rgb_factors
         latent_rgb_factors, latent_rgb_factors_bias = get_rgb_factors("ltxv")
         return latent_rgb_factors, latent_rgb_factors_bias
@@ -469,13 +469,20 @@ class QwenImageTransformer2DModel(nn.Module):
 
 
     def preprocess_loras(self, model_type, sd):
-        new_sd = {}
-        for k,v in sd.items():
-            if k.startswith("transformer_blocks"):
-                k = "diffusion_model." + k
-            new_sd[k] = v
-        sd = new_sd
-        return sd
+        first = next(iter(sd), None)
+        if first == None:
+            return sd
+        if first.startswith("transformer_blocks"):
+            new_sd = {}
+            for k,v in sd.items():
+                if k.startswith("transformer_blocks"):
+                    k = "diffusion_model." + k
+                new_sd[k] = v
+            sd = new_sd
+            return sd
+        else:
+            return sd
 
     def __init__(
         self,
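The rewrite above changes preprocess_loras from unconditionally rebuilding the state dict to first peeking at the first key and only adding the diffusion_model. prefix when keys start with transformer_blocks. A quick illustration on a dummy state dict:

sd = {"transformer_blocks.0.attn.to_q.lora_A.weight": 0}
first = next(iter(sd), None)
if first is not None and first.startswith("transformer_blocks"):
    sd = {"diffusion_model." + k if k.startswith("transformer_blocks") else k: v
          for k, v in sd.items()}
print(sd)  # {'diffusion_model.transformer_blocks.0.attn.to_q.lora_A.weight': 0}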
@@ -87,17 +87,17 @@ class WanAny2V:
             dtype=config.t5_dtype,
             device=torch.device('cpu'),
             checkpoint_path=text_encoder_filename,
-            tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
+            tokenizer_path=os.path.join(checkpoint_dir, "umt5-xxl"),
             shard_fn= None)
 
         # base_model_type = "i2v2_2"
-        if hasattr(config, "clip_checkpoint") and not base_model_type in ["i2v_2_2"]:
+        if hasattr(config, "clip_checkpoint") and not base_model_type in ["i2v_2_2", "i2v_2_2_multitalk"]:
             self.clip = CLIPModel(
                 dtype=config.clip_dtype,
                 device=self.device,
                 checkpoint_path=os.path.join(checkpoint_dir ,
                                              config.clip_checkpoint),
-                tokenizer_path=os.path.join(checkpoint_dir , config.clip_tokenizer))
+                tokenizer_path=os.path.join(checkpoint_dir , "clip_vit_large_patch14"))
 
 
         if base_model_type in ["ti2v_2_2"]:
@@ -495,7 +495,7 @@ class WanAny2V:
         vace = model_type in ["vace_1.3B","vace_14B", "vace_multitalk_14B"]
         phantom = model_type in ["phantom_1.3B", "phantom_14B"]
         fantasy = model_type in ["fantasy"]
-        multitalk = model_type in ["multitalk", "vace_multitalk_14B"]
+        multitalk = model_type in ["multitalk", "vace_multitalk_14B", "i2v_2_2_multitalk"]
         recam = model_type in ["recam_1.3B"]
         ti2v = model_type in ["ti2v_2_2"]
         start_step_no = 0
@@ -505,7 +505,7 @@ class WanAny2V:
         timestep_injection = False
         lat_frames = int((frame_num - 1) // self.vae_stride[0]) + 1
         # image2video
-        if model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "fantasy", "multitalk", "flf2v_720p"]:
+        if model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "fantasy", "multitalk", "i2v_2_2_multitalk", "flf2v_720p"]:
             any_end_frame = False
             if image_start is None:
                 _ , preframes_count, height, width = input_video.shape
@@ -10,7 +10,7 @@ i2v_14B = EasyDict(__name__='Config: Wan I2V 14B')
 i2v_14B.update(wan_shared_cfg)
 
 i2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
-i2v_14B.t5_tokenizer = 'google/umt5-xxl'
+i2v_14B.t5_tokenizer = 'umt5-xxl'
 
 # clip
 i2v_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
@@ -10,7 +10,7 @@ t2v_14B.update(wan_shared_cfg)
 
 # t5
 t2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
-t2v_14B.t5_tokenizer = 'google/umt5-xxl'
+t2v_14B.t5_tokenizer = 'umt5-xxl'
 
 # vae
 t2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
@@ -10,7 +10,7 @@ t2v_1_3B.update(wan_shared_cfg)
 
 # t5
 t2v_1_3B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
-t2v_1_3B.t5_tokenizer = 'google/umt5-xxl'
+t2v_1_3B.t5_tokenizer = 'umt5-xxl'
 
 # vae
 t2v_1_3B.vae_checkpoint = 'Wan2.1_VAE.pth'
@@ -504,7 +504,6 @@ class T5EncoderModel:
         else:
             self.model.to(self.device)
         # init tokenizer
-        tokenizer_path= "google/umt5-xxl"
         self.tokenizer = HuggingfaceTokenizer(
             name=tokenizer_path, seq_len=text_len, clean='whitespace')
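Together with the config changes above (t5_tokenizer = 'umt5-xxl' instead of 'google/umt5-xxl'), the tokenizer is now resolved from a folder inside the local checkpoint directory rather than fetched from the Hugging Face hub by id. Sketching the difference with transformers.AutoTokenizer as a stand-in, since HuggingfaceTokenizer's internals are not part of this diff:

import os
from transformers import AutoTokenizer

checkpoint_dir = "ckpts"

# Old behaviour: hub id, may trigger a network download at startup.
# tok = AutoTokenizer.from_pretrained("google/umt5-xxl")

# New behaviour: plain local path, works fully offline once the
# tokenizer folder has been bundled with the checkpoints.
tok = AutoTokenizer.from_pretrained(os.path.join(checkpoint_dir, "umt5-xxl"))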
@@ -2,7 +2,7 @@ import torch
 import numpy as np
 
 def test_class_i2v(base_model_type):
-    return base_model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk", ] #"hunyuan_i2v",
+    return base_model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk", "i2v_2_2_multitalk" ] #"hunyuan_i2v",
 
 def test_class_1_3B(base_model_type):
     return base_model_type in [ "vace_1.3B", "t2v_1.3B", "recam_1.3B","phantom_1.3B","fun_inp_1.3B"]
@@ -79,7 +79,7 @@ class family_handler():
         extra_model_def["no_steps_skipping"] = True
         i2v = test_class_i2v(base_model_type)
         extra_model_def["i2v_class"] = i2v
 
-        extra_model_def["multitalk_class"] = base_model_type in ["multitalk", "vace_multitalk_14B"]
+        extra_model_def["multitalk_class"] = base_model_type in ["multitalk", "vace_multitalk_14B", "i2v_2_2_multitalk"]
         vace_class = base_model_type in ["vace_14B", "vace_1.3B", "vace_multitalk_14B"]
         extra_model_def["vace_class"] = vace_class
@@ -118,7 +118,7 @@ class family_handler():
         return ["multitalk", "fantasy", "vace_14B", "vace_multitalk_14B",
                 "t2v_1.3B", "t2v", "vace_1.3B", "phantom_1.3B", "phantom_14B",
                 "recam_1.3B",
-                "i2v", "i2v_2_2", "ti2v_2_2", "flf2v_720p", "fun_inp_1.3B", "fun_inp"]
+                "i2v", "i2v_2_2", "i2v_2_2_multitalk", "ti2v_2_2", "flf2v_720p", "fun_inp_1.3B", "fun_inp"]
 
 
     @staticmethod
@@ -133,6 +133,7 @@ class family_handler():
             "vace_14B" : [ "vace_multitalk_14B"],
             "t2v" : [ "vace_14B", "vace_1.3B" "vace_multitalk_14B", "t2v_1.3B", "phantom_1.3B","phantom_14B"],
             "i2v" : [ "fantasy", "multitalk", "flf2v_720p" ],
+            "i2v_2_2" : ["i2v_2_2_multitalk"],
             "fantasy": ["multitalk"],
         }
         return models_eqv_map, models_comp_map
@@ -150,9 +151,9 @@ class family_handler():
         return 32 if base_model_type == "ti2v_2_2" else 16
 
     @staticmethod
-    def get_rgb_factors(model_type):
+    def get_rgb_factors(base_model_type ):
         from shared.RGB_factors import get_rgb_factors
-        if model_type == "ti2v_2_2": return None, None
+        if base_model_type == "ti2v_2_2": return None, None
         latent_rgb_factors, latent_rgb_factors_bias = get_rgb_factors("wan")
         return latent_rgb_factors, latent_rgb_factors_bias
@@ -17,7 +17,7 @@ gradio==5.23.0
 numpy>=1.23.5,<2
 einops
 moviepy==1.0.3
-mmgp==3.5.7
+mmgp==3.5.8
 peft==0.15.0
 mutagen
 pydantic==2.10.6
wgp.py (29 changes)
@@ -44,13 +44,16 @@ from preprocessing.matanyone import app as matanyone_app
 from tqdm import tqdm
 import requests
 
+# import torch._dynamo as dynamo
+# dynamo.config.recompile_limit = 2000 # default is 256
+# dynamo.config.accumulated_recompile_limit = 2000 # or whatever limit you want
+
 global_queue_ref = []
 AUTOSAVE_FILENAME = "queue.zip"
 PROMPT_VARS_MAX = 10
 
-target_mmgp_version = "3.5.7"
-WanGP_version = "7.74"
+target_mmgp_version = "3.5.8"
+WanGP_version = "7.75"
 settings_version = 2.23
 max_source_video_frames = 3000
 prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
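The commented dynamo lines are the "fixed pytorch compilation" part of the commit message: a pipeline whose input shapes keep changing (resolution, frame count) can exhaust torch.compile's recompile budget, after which Dynamo silently falls back to eager mode. Uncommented, and as the comments themselves suggest, the fix would read:

import torch._dynamo as dynamo

# Raise Dynamo's per-function and global recompile budgets so that
# changing resolutions or frame counts keep compiling instead of
# falling back to eager (the per-function default is 256).
dynamo.config.recompile_limit = 2000
dynamo.config.accumulated_recompile_limit = 2000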
@@ -1616,7 +1619,8 @@ def _parse_args():
 
 def get_lora_dir(model_type):
     model_family = get_model_family(model_type)
-    i2v = test_class_i2v(model_type) and not get_base_model_type(model_type) == "i2v_2_2"
+    base_model_type = get_base_model_type(model_type)
+    i2v = test_class_i2v(model_type) and base_model_type == "i2v_2_2"
     if model_family == "wan":
         lora_dir =args.lora_dir
         if i2v and len(lora_dir)==0:
@@ -1629,7 +1633,7 @@ def get_lora_dir(model_type):
             lora_dir_1_3B = os.path.join(root_lora_dir, "1.3B")
             if os.path.isdir(lora_dir_1_3B ):
                 return lora_dir_1_3B
-        elif model_type == "ti2v_2_2":
+        elif base_model_type == "ti2v_2_2":
             lora_dir_5B = os.path.join(root_lora_dir, "5B")
             if os.path.isdir(lora_dir_5B ):
                 return lora_dir_5B
@@ -3641,7 +3645,7 @@ def perform_spatial_upsampling(sample, spatial_upsampling):
 
 def any_audio_track(model_type):
     base_model_type = get_base_model_type(model_type)
-    return base_model_type in ["fantasy", "multitalk", "hunyuan_avatar", "hunyuan_custom_audio", "vace_multitalk_14B"]
+    return base_model_type in ["fantasy", "hunyuan_avatar", "hunyuan_custom_audio"] or get_model_def(model_type).get("multitalk_class", False)
 
 def get_available_filename(target_path, video_source, suffix = "", force_extension = None):
     name, extension = os.path.splitext(os.path.basename(video_source))
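This hunk shows the recurring refactor of the commit: hardcoded multitalk model lists are replaced by a multitalk_class capability flag declared once in the model definition, so a new combination like i2v_2_2_multitalk only has to set the flag. A self-contained sketch of the pattern, with signatures simplified from the diff:

def make_model_def(base_model_type):
    # Capability declared once, per family handler (list copied from the diff).
    return {"multitalk_class": base_model_type in
            ["multitalk", "vace_multitalk_14B", "i2v_2_2_multitalk"]}

def any_audio_track(model_def, base_model_type):
    # Call sites query the flag instead of enumerating model types.
    return (base_model_type in ["fantasy", "hunyuan_avatar", "hunyuan_custom_audio"]
            or model_def.get("multitalk_class", False))

assert any_audio_track(make_model_def("i2v_2_2_multitalk"), "i2v_2_2_multitalk")
assert not any_audio_track(make_model_def("t2v"), "t2v")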
@@ -3950,6 +3954,12 @@ def generate_video(
     model_filename,
     mode,
 ):
+    # import os
+    # os.environ.pop("TORCH_LOGS", None) # make sure no env var is suppressing/overriding
+    # import torch._logging as tlog
+    # tlog.set_logs(recompiles=True, guards=True, graph_breaks=True)
+
+
     def remove_temp_filenames(temp_filenames_list):
         for temp_filename in temp_filenames_list:
@@ -4094,7 +4104,7 @@ def generate_video(
     hunyuan_custom_edit = hunyuan_custom and "edit" in model_filename
     hunyuan_avatar = "hunyuan_video_avatar" in model_filename
     fantasy = base_model_type in ["fantasy"]
-    multitalk = base_model_type in ["multitalk", "vace_multitalk_14B"]
+    multitalk = model_def.get("multitalk_class", False)
     flux = base_model_type in ["flux"]
 
     if "B" in audio_prompt_type or "X" in audio_prompt_type:
@@ -4821,8 +4831,9 @@ def generate_preview(model_type, latents):
     import einops
     if latents is None: return None
     model_handler = get_model_handler(model_type)
+    base_model_type = get_base_model_type(model_type)
     if hasattr(model_handler, "get_rgb_factors"):
-        latent_rgb_factors, latent_rgb_factors_bias = model_handler.get_rgb_factors(model_type)
+        latent_rgb_factors, latent_rgb_factors_bias = model_handler.get_rgb_factors(base_model_type )
     else:
         return None
     if latent_rgb_factors is None: return None
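For context, the RGB factors fetched here let generate_preview project latents straight to RGB without a VAE decode: every latent channel contributes a fixed RGB triple plus a bias. A minimal sketch of that projection, with shapes and names assumed rather than copied from WanGP:

import torch

def latents_to_preview(latents, rgb_factors, rgb_bias):
    # latents: (C, H, W); rgb_factors: C rows of [r, g, b]; rgb_bias: [r, g, b].
    factors = torch.tensor(rgb_factors)    # (C, 3)
    bias = torch.tensor(rgb_bias)          # (3,)
    rgb = torch.einsum("chw,cd->dhw", latents, factors) + bias[:, None, None]
    return rgb.clamp(-1, 1)                # rough preview, no VAE decode

preview = latents_to_preview(torch.randn(16, 90, 160),
                             [[0.1, 0.0, 0.05]] * 16, [0.0, 0.0, 0.0])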
@@ -5520,7 +5531,7 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None
     if not test_any_sliding_window( base_model_type):
         pop += ["sliding_window_size", "sliding_window_overlap", "sliding_window_overlap_noise", "sliding_window_discard_last_frames", "sliding_window_color_correction_strength"]
 
-    if not base_model_type in ["fantasy", "multitalk", "vace_multitalk_14B"]:
+    if not (base_model_type in ["fantasy"] or model_def.get("multitalk_class", False)):
         pop += ["audio_guidance_scale", "speakers_locations"]
 
     if not model_def.get("embedded_guidance", False) or model_def.get("no_guidance", False):
@@ -6505,7 +6516,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
     vace = test_vace_module(base_model_type)
     phantom = base_model_type in ["phantom_1.3B", "phantom_14B"]
     fantasy = base_model_type in ["fantasy"]
-    multitalk = base_model_type in ["multitalk", "vace_multitalk_14B"]
+    multitalk = model_def.get("multitalk_class", False)
     hunyuan_t2v = "hunyuan_video_720" in model_filename
     hunyuan_i2v = "hunyuan_video_i2v" in model_filename
     hunyuan_video_custom = "hunyuan_video_custom" in model_filename