wan 2.2 image2video support

This commit is contained in:
deepbeepmeep 2025-07-30 22:21:30 +02:00
parent 2f4795f754
commit 4137a86e1f
8 changed files with 106 additions and 67 deletions

View File

@ -20,6 +20,15 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
**Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
## 🔥 Latest Updates :
### July 30 2025: WanGP v7.5: Just another release ... Wan 2.2 part 2
Here is now Wan 2.2 image2video, a very good model if you want to set Start and End frames. Two Wan 2.2 models delivered, only one to go ...
Please note that although it is an image2video model, it is structurally very close to Wan 2.2 text2video (same layers, with only a different initial projection). Given that Wan 2.1 image2video loras don't work too well with it (half of their tensors are not supported), I have decided that this model will look for its loras in the text2video loras folder instead of the image2video folder.
I have also optimized RAM management for Wan 2.2 so that loras and modules are loaded only once into RAM and Reserved RAM; this saves up to 5 GB of RAM, which can make a difference...
And this time I really removed Vace Cocktail Light, which produced blurry output.
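A minimal sketch of the resulting lora folder routing, mirroring the `get_lora_dir` / `test_class_i2v` changes to wgp.py further down in this diff (the folder names are assumed WanGP defaults; this is an illustration, not the exact implementation):

```python
def resolve_lora_dir(base_model_type: str) -> str:
    # i2v-class models normally read loras from the i2v lora folder...
    i2v_class = base_model_type in ("i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp",
                                    "flf2v_720p", "fantasy", "multitalk")
    # ...but Wan 2.2 image2video ("i2v_2_2") shares the t2v layer layout,
    # so it is deliberately routed to the t2v lora folder instead.
    if i2v_class and base_model_type != "i2v_2_2":
        return "loras_i2v"   # assumed default i2v lora folder
    return "loras"           # assumed default t2v lora folder
```

In short, anything i2v-class still resolves to the i2v lora folder, except i2v_2_2, which falls back to the t2v folder so existing t2v loras can be reused.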
### July 29 2025: WanGP v7.4: Just another release ... Wan 2.2 Preview
Wan 2.2 is here. The good news is that WanGP won't require a single byte of extra VRAM to run it and it will be as fast as Wan 2.1. The bad news is that you will need much more RAM if you want to fully leverage this new model, since it has twice as many parameters.

View File

@ -1,14 +1,14 @@
{
"_class_name": "WanModel",
"_diffusers_version": "0.30.0",
"_diffusers_version": "0.33.0",
"dim": 5120,
"eps": 1e-06,
"ffn_dim": 13824,
"freq_dim": 256,
"in_dim": 36,
"model_type": "i2v",
"model_type": "i2v2_2",
"num_heads": 40,
"num_layers": 40,
"out_dim": 16,
"text_len": 512
}

24
defaults/i2v_2_2.json Normal file
View File

@ -0,0 +1,24 @@
{
"model":
{
"name": "Wan2.2 Image2video 14B",
"architecture" : "i2v_2_2",
"description": "Wan 2.2 Image 2 Video model. Contrary to the Wan Image2video 2.1 this model is structurally close to the t2v model. You will need consequently to store Loras for this model in the t2v Lora Folder.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_quanto_mfp16_int8.safetensors"
],
"URLs2": [
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_quanto_mfp16_int8.safetensors"
],
"group": "wan2_2"
},
"switch_threshold" : 900,
"guidance_scale" : 3.5,
"guidance2_scale" : 3.5,
"flow_shift" : 5
}
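This finetune definition splits the model across two checkpoints: URLs points at the high-noise expert, URLs2 at the low-noise expert, and switch_threshold tells the sampler when to hand over between them. Below is a hedged sketch of how that handover and the two guidance scales could be applied; the function names are hypothetical, and the 0-1000 noise scale is inferred from the "Switch at Noise Level" label added to wgp.py later in this diff.

```python
def pick_transformer(noise_level: float, switch_threshold: float,
                     high_noise_model, low_noise_model):
    # Sketch of the dual-expert dispatch suggested by this config: the high-noise
    # model (URLs) handles the early, noisy steps; once the current noise level
    # drops below switch_threshold (900 here, on a 0-1000 scale), the low-noise
    # model (URLs2) takes over for the remaining steps.
    return high_noise_model if noise_level >= switch_threshold else low_noise_model


def pick_guidance(noise_level: float, switch_threshold: float,
                  guidance_scale: float, guidance2_scale: float) -> float:
    # guidance_scale applies while the high-noise model is active,
    # guidance2_scale after the switch (both 3.5 in this default).
    return guidance_scale if noise_level >= switch_threshold else guidance2_scale
```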

View File

@ -1,18 +0,0 @@
{
"model": {
"name": "Wan2.2 Vace Cocktail Light 14B",
"architecture": "vace_14B",
"modules": [
"vace_14B"
],
"description": "This model has been created on the fly using the Wan text 2.2 video model and the Loras of FusioniX. The weight of the Detail Enhancer Lora has been reduced to improve identity preservation. Only the high noise part of the v2.2 model is used to reduce RAM usage.",
"URLs": "t2v_2_2",
"loras": "vace_14B_cocktail_2_2",
"loras_multipliers": "vace_14B_cocktail_2_2",
"group": "wan2_2"
},
"num_inference_steps": 10,
"guidance_scale": 1,
"guidance2_scale": 1,
"flow_shift": 2
}

View File

@ -227,7 +227,7 @@ def prepare_kontext(
img_cond_seq = None
img_cond_seq_ids = None
if img_cond_list == None: img_cond_list = []
for cond_no, img_cond in enumerate(img_cond_list):
width, height = img_cond.size
aspect_ratio = width / height

View File

@ -88,7 +88,8 @@ class WanAny2V:
tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
shard_fn= None)
if hasattr(config, "clip_checkpoint"):
# base_model_type = "i2v2_2"
if hasattr(config, "clip_checkpoint") and not base_model_type in ["i2v_2_2"]:
self.clip = CLIPModel(
dtype=config.clip_dtype,
device=self.device,
@ -108,30 +109,35 @@ class WanAny2V:
# with open(config_filename, 'r', encoding='utf-8') as f:
# config = json.load(f)
# sd = safetensors2.torch_load_file(xmodel_filename)
# model_filename = "c:/temp/wan2.2t2v/high/diffusion_pytorch_model-00001-of-00006.safetensors"
# model_filename = "c:/temp/wan2.2i2v/low/diffusion_pytorch_model-00001-of-00006.safetensors"
base_config_file = f"configs/{base_model_type}.json"
forcedConfigPath = base_config_file if len(model_filename) > 1 else None
# forcedConfigPath = base_config_file = f"configs/flf2v_720p.json"
# model_filename[1] = xmodel_filename
model_filename2 = None
if self.transformer_switch:
model_filename2 = model_filename[1:]
model_filename = model_filename[:1] + model_filename[2:]
self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
if model_filename2 is not None:
self.model2 = offload.fast_load_transformers_model(model_filename2, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
if self.transformer_switch:
shared_modules= {}
self.model = offload.fast_load_transformers_model(model_filename[:1], modules = model_filename[2:], modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath, return_shared_modules= shared_modules)
self.model2 = offload.fast_load_transformers_model(model_filename[1:2], modules = shared_modules, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
shared_modules = None
else:
self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
# self.model = offload.load_model_data(self.model, xmodel_filename )
# offload.load_model_data(self.model, "c:/temp/Phantom-Wan-1.3B.pth")
self.model.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype)
offload.change_dtype(self.model, dtype, True)
if self.model2 is not None:
self.model2.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype)
offload.change_dtype(self.model2, dtype, True)
# offload.save_model(self.model, "wan2.1_text2video_1.3B_mbf16.safetensors", do_quantize= False, config_file_path=base_config_file, filter_sd=sd)
# offload.save_model(self.model, "wan2.2_text2video_14B_high_mbf16.safetensors", config_file_path=base_config_file)
# offload.save_model(self.model, "wan2.2_text2video_14B_high_quanto_mfp16_int8.safetensors", do_quantize=True, config_file_path=base_config_file)
# offload.save_model(self.model, "wan2.2_image2video_14B_low_mbf16.safetensors", config_file_path=base_config_file)
# offload.save_model(self.model, "wan2.2_image2video_14B_low_quanto_mbf16_int8.safetensors", do_quantize=True, config_file_path=base_config_file)
self.model.eval().requires_grad_(False)
if self.model2 is not None:
self.model2.eval().requires_grad_(False)
if save_quantized:
from wgp import save_quantized_model
save_quantized_model(self.model, model_type, model_filename[0], dtype, base_config_file)
@ -480,8 +486,11 @@ class WanAny2V:
any_end_frame = False
if input_frames != None:
_ , preframes_count, height, width = input_frames.shape
lat_h, lat_w = height // self.vae_stride[1], width // self.vae_stride[2]
clip_context = self.clip.visual([input_frames[:, -1:]]) if model_type != "flf2v_720p" else self.clip.visual([input_frames[:, -1:], input_frames[:, -1:]])
lat_h, lat_w = height // self.vae_stride[1], width // self.vae_stride[2]
if hasattr(self, "clip"):
clip_context = self.clip.visual([input_frames[:, -1:]]) if model_type != "flf2v_720p" else self.clip.visual([input_frames[:, -1:], input_frames[:, -1:]])
else:
clip_context = None
input_frames = input_frames.to(device=self.device).to(dtype= self.VAE_dtype)
enc = torch.concat( [input_frames, torch.zeros( (3, frame_num-preframes_count, height, width),
device=self.device, dtype= self.VAE_dtype)],
@ -513,19 +522,24 @@ class WanAny2V:
self.patch_size[2] * self.patch_size[2])
h = lat_h * self.vae_stride[1]
w = lat_w * self.vae_stride[2]
clip_image_size = self.clip.model.image_size
img_interpolated = resize_lanczos(image_start, h, w).sub_(0.5).div_(0.5).unsqueeze(0).transpose(0,1).to(self.device) #, self.dtype
image_start = resize_lanczos(image_start, clip_image_size, clip_image_size)
image_start = image_start.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
color_reference_frame = image_start.clone()
color_reference_frame = img_interpolated.clone()
if image_end!= None:
img_interpolated2 = resize_lanczos(image_end, h, w).sub_(0.5).div_(0.5).unsqueeze(0).transpose(0,1).to(self.device) #, self.dtype
image_end = resize_lanczos(image_end, clip_image_size, clip_image_size)
image_end = image_end.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
if model_type == "flf2v_720p":
clip_context = self.clip.visual([image_start[:, None, :, :], image_end[:, None, :, :] if image_end != None else image_start[:, None, :, :]])
if hasattr(self, "clip"):
clip_image_size = self.clip.model.image_size
image_start = resize_lanczos(image_start, clip_image_size, clip_image_size)
image_start = image_start.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
if image_end!= None:
image_end = resize_lanczos(image_end, clip_image_size, clip_image_size)
image_end = image_end.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
if model_type == "flf2v_720p":
clip_context = self.clip.visual([image_start[:, None, :, :], image_end[:, None, :, :] if image_end != None else image_start[:, None, :, :]])
else:
clip_context = self.clip.visual([image_start[:, None, :, :]])
else:
clip_context = self.clip.visual([image_start[:, None, :, :]])
clip_context = None
if any_end_frame:
enc= torch.concat([
@ -563,7 +577,9 @@ class WanAny2V:
extended_overlapped_latents = lat_y[:, :overlapped_latents_frames_num].clone().unsqueeze(0)
y = torch.concat([msk, lat_y])
lat_y = None
kwargs.update({'clip_fea': clip_context, 'y': y})
kwargs.update({ 'y': y})
if clip_context is not None:
kwargs.update({'clip_fea': clip_context})
# Recam Master
if target_camera != None:
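The shared_modules block above is what backs the RAM saving mentioned in the v7.5 release note: the first fast_load_transformers_model call collects the tensors both Wan 2.2 experts have in common, and the second call reuses them instead of loading another copy from disk. A condensed sketch of the pattern follows, with the mmgp call signature taken from the diff above, illustrative argument values, and quantization left disabled:

```python
from mmgp import offload  # assumed import path used by WanGP

def load_dual_wan(model_filename, WanModel, base_config_file):
    """Condensed sketch of the shared-module load performed above.

    model_filename is assumed to be a list:
    [transformer_1_ckpt, transformer_2_ckpt, *extra_module_ckpts].
    """
    shared_modules = {}
    # First load fills shared_modules with the tensors both experts have in common.
    model = offload.fast_load_transformers_model(
        model_filename[:1], modules=model_filename[2:], modelClass=WanModel,
        do_quantize=False, writable_tensors=False,
        defaultConfigPath=base_config_file, forcedConfigPath=None,
        return_shared_modules=shared_modules)
    # Second load reuses those tensors instead of reading them again,
    # which is where the up-to-5 GB RAM saving in the release note comes from.
    model2 = offload.fast_load_transformers_model(
        model_filename[1:2], modules=shared_modules, modelClass=WanModel,
        do_quantize=False, writable_tensors=False,
        defaultConfigPath=base_config_file, forcedConfigPath=None)
    shared_modules = None  # drop the temporary reference
    return model, model2
```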

View File

@ -789,7 +789,7 @@ class WanModel(ModelMixin, ConfigMixin):
sd = new_sd
from wgp import test_class_i2v
if not test_class_i2v(model_type):
if not test_class_i2v(model_type) or model_type in ["i2v_2_2"]:
new_sd = {}
# convert loras for i2v to t2v
for k,v in sd.items():
@ -842,7 +842,7 @@ class WanModel(ModelMixin, ConfigMixin):
super().__init__()
assert model_type in ['t2v', 'i2v']
assert model_type in ['t2v', 'i2v', 'i2v2_2']
self.model_type = model_type
self.patch_size = patch_size
@ -889,7 +889,7 @@ class WanModel(ModelMixin, ConfigMixin):
# blocks
if vace_layers == None:
cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn'
cross_attn_type = 't2v_cross_attn' if model_type in ['t2v','i2v2_2'] else 'i2v_cross_attn'
self.blocks = nn.ModuleList([
WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
window_size, qk_norm, cross_attn_norm, eps, block_no =i, output_dim=multitalk_output_dim, norm_input_visual=norm_input_visual)

46
wgp.py
View File

@ -50,8 +50,8 @@ global_queue_ref = []
AUTOSAVE_FILENAME = "queue.zip"
PROMPT_VARS_MAX = 10
target_mmgp_version = "3.5.1"
WanGP_version = "7.4"
target_mmgp_version = "3.5.3"
WanGP_version = "7.5"
settings_version = 2.23
max_source_video_frames = 3000
prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
@ -1581,7 +1581,7 @@ def _parse_args():
def get_lora_dir(model_type):
model_family = get_model_family(model_type)
i2v = test_class_i2v(model_type)
i2v = test_class_i2v(model_type) and not get_base_model_type(model_type) == "i2v_2_2"
if model_family == "wan":
lora_dir =args.lora_dir
if i2v and len(lora_dir)==0:
@ -1691,7 +1691,8 @@ for path in ["wan2.1_Vace_1.3B_preview_bf16.safetensors", "sky_reels2_diffusion
"sky_reels2_diffusion_forcing_720p_14B_quanto_int8.safetensors", "sky_reels2_diffusion_forcing_720p_14B_quanto_fp16_int8.safetensors", "wan2.1_image2video_480p_14B_bf16.safetensors", "wan2.1_image2video_480p_14B_quanto_int8.safetensors",
"wan2.1_image2video_720p_14B_quanto_int8.safetensors", "wan2.1_image2video_720p_14B_quanto_fp16_int8.safetensors", "wan2.1_image2video_720p_14B_bf16.safetensors",
"wan2.1_text2video_14B_bf16.safetensors", "wan2.1_text2video_14B_quanto_int8.safetensors",
"wan2.1_Vace_14B_mbf16.safetensors", "wan2.1_Vace_14B_quanto_mbf16_int8.safetensors", "wan2.1_FLF2V_720p_14B_quanto_int8.safetensors", "wan2.1_FLF2V_720p_14B_bf16.safetensors", "wan2.1_FLF2V_720p_14B_fp16.safetensors", "wan2.1_Vace_1.3B_mbf16.safetensors", "wan2.1_text2video_1.3B_bf16.safetensors"
"wan2.1_Vace_14B_mbf16.safetensors", "wan2.1_Vace_14B_quanto_mbf16_int8.safetensors", "wan2.1_FLF2V_720p_14B_quanto_int8.safetensors", "wan2.1_FLF2V_720p_14B_bf16.safetensors", "wan2.1_FLF2V_720p_14B_fp16.safetensors", "wan2.1_Vace_1.3B_mbf16.safetensors", "wan2.1_text2video_1.3B_bf16.safetensors",
"ltxv_0.9.7_13B_dev_bf16.safetensors"
]:
if Path(os.path.join("ckpts" , path)).is_file():
print(f"Removing old version of model '{path}'. A new version of this model will be downloaded next time you use it.")
@ -1712,7 +1713,7 @@ modules_files = {
base_types = ["multitalk", "fantasy", "vace_14B", "vace_multitalk_14B",
"t2v_1.3B", "t2v", "vace_1.3B", "phantom_1.3B", "phantom_14B",
"recam_1.3B", "sky_df_1.3B", "sky_df_14B",
"i2v", "flf2v_720p", "fun_inp_1.3B", "fun_inp", "ltxv_13B",
"i2v", "i2v_2_2", "flf2v_720p", "fun_inp_1.3B", "fun_inp", "ltxv_13B",
"hunyuan", "hunyuan_i2v", "hunyuan_custom", "hunyuan_custom_audio", "hunyuan_custom_edit", "hunyuan_avatar", "flux"
]
@ -1792,7 +1793,7 @@ def get_model_family(model_type, for_ui = False):
def test_class_i2v(model_type):
model_type = get_base_model_type(model_type)
return model_type in ["i2v", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk" ] #"hunyuan_i2v",
return model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk" ] #"hunyuan_i2v",
def test_vace_module(model_type):
model_type = get_base_model_type(model_type)
@ -2632,7 +2633,8 @@ def load_wan_model(model_filename, model_type, base_model_type, model_def, quant
pipe = {"transformer": wan_model.model, "text_encoder" : wan_model.text_encoder.model, "vae": wan_model.vae.model }
if wan_model.model2 is not None:
pipe["transformer2"] = wan_model.model2
# del pipe["transformer"]
# pipe["transformer"] = wan_model.model
if hasattr(wan_model, "clip"):
pipe["text_encoder_2"] = wan_model.clip.model
return wan_model, pipe
@ -2803,10 +2805,9 @@ def load_models(model_type):
if "transformer2" in pipe:
loras_transformer += ["transformer2"]
if profile in [2,4]:
if profile in [3,4]:
kwargs["pinnedMemory"] = ["transformer", "transformer2"]
global prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer
if server_config.get("enhancer_enabled", 0) == 1:
from transformers import ( AutoModelForCausalLM, AutoProcessor, AutoTokenizer, LlamaForCausalLM )
@ -3317,11 +3318,15 @@ def select_video(state, input_file_list, event_data: gr.EventData):
if video_length != frames_count: video_length_summary += f"real: {frames_count} frames, "
video_length_summary += f"{frames_count/fps:.1f}s, {round(fps)} fps)"
video_guidance_scale = configs.get("guidance_scale", None)
video_guidance2_scale = configs.get("guidance2_scale", None)
video_switch_threshold = configs.get("switch_threshold", 0)
video_embedded_guidance_scale = configs.get("embedded_guidance_scale ", None)
if model_family in ["hunyuan", "flux"]:
video_guidance_scale = video_embedded_guidance_scale
video_guidance_label = "Embedded Guidance Scale"
else:
if video_switch_threshold > 0:
video_guidance_scale = f"{video_guidance_scale} (High Noise), {video_guidance2_scale} (Low Noise) with Switch at Noise Level {video_switch_threshold}"
video_guidance_label = "Guidance"
video_flow_shift = configs.get("flow_shift", None)
video_video_guide_outpainting = configs.get("video_guide_outpainting", "")
@ -4232,15 +4237,15 @@ def generate_video(
loras_selected = transformer_loras_filenames + loras_selected
loras_list_mult_choices_nums = transformer_loras_multipliers + loras_list_mult_choices_nums
loras_slists = transformer_loras_multipliers + loras_slists
trans_list = [trans]
if trans2 is not None: trans_list += [trans2]
for trans_item in trans_list:
offload.load_loras_into_model(trans_item, loras_selected, loras_list_mult_choices_nums, activate_all_loras=True, preprocess_sd=get_loras_preprocessor(trans, base_model_type), pinnedLora=pinnedLora, split_linear_modules_map = split_linear_modules_map)
errors = trans._loras_errors
if len(errors) > 0:
error_files = [msg for _ , msg in errors]
raise gr.Error("Error while loading Loras: " + ", ".join(error_files))
trans_item = trans_list = None
offload.load_loras_into_model(trans , loras_selected, loras_list_mult_choices_nums, activate_all_loras=True, preprocess_sd=get_loras_preprocessor(trans, base_model_type), pinnedLora=pinnedLora, split_linear_modules_map = split_linear_modules_map)
errors = trans._loras_errors
if len(errors) > 0:
error_files = [msg for _ , msg in errors]
raise gr.Error("Error while loading Loras: " + ", ".join(error_files))
if trans2 is not None:
offload.sync_models_loras(trans, trans2)
seed = None if seed == -1 else seed
# negative_prompt = "" # not applicable in the inference
original_filename = model_filename
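The lora hunk above is the other half of the single-copy handling described in the release note: loras are loaded into the primary transformer only, and offload.sync_models_loras then lets the second transformer share those tensors rather than load its own copy. A hedged sketch of that flow, reusing the calls shown in the diff with simplified helper arguments and error handling:

```python
from mmgp import offload  # assumed import path used by WanGP

def attach_loras(trans, trans2, loras_selected, multipliers,
                 preprocess_sd=None, pinnedLora=False, split_linear_modules_map=None):
    # Load the selected loras once, into the primary transformer.
    offload.load_loras_into_model(
        trans, loras_selected, multipliers, activate_all_loras=True,
        preprocess_sd=preprocess_sd, pinnedLora=pinnedLora,
        split_linear_modules_map=split_linear_modules_map)
    errors = trans._loras_errors
    if errors:
        raise RuntimeError("Error while loading Loras: " +
                           ", ".join(msg for _, msg in errors))
    # Share the already-loaded lora tensors with the second transformer
    # instead of loading a second copy into RAM.
    if trans2 is not None:
        offload.sync_models_loras(trans, trans2)
```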
@ -4619,7 +4624,9 @@ def generate_video(
if len(extra_process_list) == 1:
status_info += " and " + processes_names[extra_process_list[0]]
elif len(extra_process_list) == 2:
status_info += ", " + processes_names[extra_process_list[0]] + " and " + processes_names[extra_process_list[1]]
status_info += ", " + processes_names[extra_process_list[0]] + " and " + processes_names[extra_process_list[1]]
if preprocess_type2 is not None:
context_scale = [ control_net_weight /2, control_net_weight2 /2]
send_cmd("progress", [0, get_latest_status(state, status_info)])
video_guide_processed, video_mask_processed = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) , start_frame = guide_start_frame, fit_canvas = sample_fit_canvas, target_fps = fps, process_type = preprocess_type, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =1 )
if preprocess_type2 != None:
@ -4768,6 +4775,7 @@ def generate_video(
remove_temp_filenames(temp_filenames_list)
offloadobj.unload_all()
offload.unload_loras_from_model(trans)
if trans2 is not None: offload.unload_loras_from_model(trans2)
# if compile:
# cache_size = torch._dynamo.config.cache_size_limit
# torch.compiler.reset()