wan 2.2 image2video support

This commit is contained in:
deepbeepmeep 2025-07-30 22:21:30 +02:00
parent 2f4795f754
commit 4137a86e1f
8 changed files with 106 additions and 67 deletions

View File

@ -20,6 +20,15 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
**Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
## 🔥 Latest Updates :
### July 30 2025: WanGP v7.5: Just another release ... Wan 2.2 part 2
Here is now Wan 2.2 image2video, a very good model if you want to set Start and End frames. Two Wan 2.2 models delivered, only one to go ...
Please note that although it is an image2video model, it is structurally very close to Wan 2.2 text2video (same layers, with only a different initial projection). Given that Wan 2.1 image2video loras don't work too well with it (half of their tensors are not supported), I have decided that this model will look for its loras in the text2video loras folder instead of the image2video folder.
I have also optimized RAM management for Wan 2.2 so that loras and modules are loaded only once into RAM and Reserved RAM; this saves up to 5 GB of RAM, which can make a difference...
And this time I really removed Vace Cocktail Light, which produced blurry output.
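A minimal sketch of the resulting lora folder routing, mirroring the `get_lora_dir` / `test_class_i2v` changes to wgp.py further down in this diff (the folder names are assumed WanGP defaults; this is an illustration, not the exact implementation):

```python
def resolve_lora_dir(base_model_type: str) -> str:
    # i2v-class models normally read loras from the i2v lora folder...
    i2v_class = base_model_type in ("i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp",
                                    "flf2v_720p", "fantasy", "multitalk")
    # ...but Wan 2.2 image2video ("i2v_2_2") shares the t2v layer layout,
    # so it is deliberately routed to the t2v lora folder instead.
    if i2v_class and base_model_type != "i2v_2_2":
        return "loras_i2v"   # assumed default i2v lora folder
    return "loras"           # assumed default t2v lora folder
```

In short, anything i2v-class still resolves to the i2v lora folder, except i2v_2_2, which falls back to the t2v folder so existing t2v loras can be reused.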
### July 29 2025: WanGP v7.4: Just another release ... Wan 2.2 Preview
Wan 2.2 is here. The good news is that WanGP won't require a single byte of extra VRAM to run it and it will be as fast as Wan 2.1. The bad news is that you will need much more RAM if you want to fully leverage this new model, since it has twice as many parameters.

View File

@ -1,14 +1,14 @@
{
"_class_name": "WanModel",
"_diffusers_version": "0.30.0",
"_diffusers_version": "0.33.0",
"dim": 5120,
"eps": 1e-06,
"ffn_dim": 13824,
"freq_dim": 256,
"in_dim": 36,
"model_type": "i2v",
"model_type": "i2v2_2",
"num_heads": 40,
"num_layers": 40,
"out_dim": 16,
"text_len": 512
}

24
defaults/i2v_2_2.json Normal file
View File

@ -0,0 +1,24 @@
{
"model":
{
"name": "Wan2.2 Image2video 14B",
"architecture" : "i2v_2_2",
"description": "Wan 2.2 Image 2 Video model. Contrary to the Wan Image2video 2.1 this model is structurally close to the t2v model. You will need consequently to store Loras for this model in the t2v Lora Folder.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_quanto_mfp16_int8.safetensors"
],
"URLs2": [
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_quanto_mfp16_int8.safetensors"
],
"group": "wan2_2"
},
"switch_threshold" : 900,
"guidance_scale" : 3.5,
"guidance2_scale" : 3.5,
"flow_shift" : 5
}
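This finetune definition splits the model across two checkpoints: URLs points at the high-noise expert, URLs2 at the low-noise expert, and switch_threshold tells the sampler when to hand over between them. Below is a hedged sketch of how that handover and the two guidance scales could be applied; the function names are hypothetical, and the 0-1000 noise scale is inferred from the "Switch at Noise Level" label added to wgp.py later in this diff.

```python
def pick_transformer(noise_level: float, switch_threshold: float,
                     high_noise_model, low_noise_model):
    # Sketch of the dual-expert dispatch suggested by this config: the high-noise
    # model (URLs) handles the early, noisy steps; once the current noise level
    # drops below switch_threshold (900 here, on a 0-1000 scale), the low-noise
    # model (URLs2) takes over for the remaining steps.
    return high_noise_model if noise_level >= switch_threshold else low_noise_model


def pick_guidance(noise_level: float, switch_threshold: float,
                  guidance_scale: float, guidance2_scale: float) -> float:
    # guidance_scale applies while the high-noise model is active,
    # guidance2_scale after the switch (both 3.5 in this default).
    return guidance_scale if noise_level >= switch_threshold else guidance2_scale
```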

View File

@ -1,18 +0,0 @@
{
"model": {
"name": "Wan2.2 Vace Cocktail Light 14B",
"architecture": "vace_14B",
"modules": [
"vace_14B"
],
"description": "This model has been created on the fly using the Wan text 2.2 video model and the Loras of FusioniX. The weight of the Detail Enhancer Lora has been reduced to improve identity preservation. Only the high noise part of the v2.2 model is used to reduce RAM usage.",
"URLs": "t2v_2_2",
"loras": "vace_14B_cocktail_2_2",
"loras_multipliers": "vace_14B_cocktail_2_2",
"group": "wan2_2"
},
"num_inference_steps": 10,
"guidance_scale": 1,
"guidance2_scale": 1,
"flow_shift": 2
}

View File

@ -227,7 +227,7 @@ def prepare_kontext(
img_cond_seq = None
img_cond_seq_ids = None
if img_cond_list == None: img_cond_list = []
for cond_no, img_cond in enumerate(img_cond_list):
width, height = img_cond.size
aspect_ratio = width / height

View File

@ -88,7 +88,8 @@ class WanAny2V:
tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
shard_fn= None)
if hasattr(config, "clip_checkpoint"):
# base_model_type = "i2v2_2"
if hasattr(config, "clip_checkpoint") and not base_model_type in ["i2v_2_2"]:
self.clip = CLIPModel(
dtype=config.clip_dtype,
device=self.device,
@ -108,30 +109,35 @@ class WanAny2V:
# with open(config_filename, 'r', encoding='utf-8') as f:
# config = json.load(f)
# sd = safetensors2.torch_load_file(xmodel_filename)
# model_filename = "c:/temp/wan2.2t2v/high/diffusion_pytorch_model-00001-of-00006.safetensors"
# model_filename = "c:/temp/wan2.2i2v/low/diffusion_pytorch_model-00001-of-00006.safetensors"
base_config_file = f"configs/{base_model_type}.json"
forcedConfigPath = base_config_file if len(model_filename) > 1 else None
# forcedConfigPath = base_config_file = f"configs/flf2v_720p.json"
# model_filename[1] = xmodel_filename
model_filename2 = None
if self.transformer_switch:
model_filename2 = model_filename[1:]
model_filename = model_filename[:1] + model_filename[2:]
self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
if model_filename2 is not None:
self.model2 = offload.fast_load_transformers_model(model_filename2, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
if self.transformer_switch:
shared_modules= {}
self.model = offload.fast_load_transformers_model(model_filename[:1], modules = model_filename[2:], modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath, return_shared_modules= shared_modules)
self.model2 = offload.fast_load_transformers_model(model_filename[1:2], modules = shared_modules, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
shared_modules = None
else:
self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
# self.model = offload.load_model_data(self.model, xmodel_filename )
# offload.load_model_data(self.model, "c:/temp/Phantom-Wan-1.3B.pth")
self.model.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype)
offload.change_dtype(self.model, dtype, True)
if self.model2 is not None:
self.model2.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype)
offload.change_dtype(self.model2, dtype, True)
# offload.save_model(self.model, "wan2.1_text2video_1.3B_mbf16.safetensors", do_quantize= False, config_file_path=base_config_file, filter_sd=sd)
# offload.save_model(self.model, "wan2.2_text2video_14B_high_mbf16.safetensors", config_file_path=base_config_file)
# offload.save_model(self.model, "wan2.2_text2video_14B_high_quanto_mfp16_int8.safetensors", do_quantize=True, config_file_path=base_config_file)
# offload.save_model(self.model, "wan2.2_image2video_14B_low_mbf16.safetensors", config_file_path=base_config_file)
# offload.save_model(self.model, "wan2.2_image2video_14B_low_quanto_mbf16_int8.safetensors", do_quantize=True, config_file_path=base_config_file)
self.model.eval().requires_grad_(False)
if self.model2 is not None:
self.model2.eval().requires_grad_(False)
if save_quantized:
from wgp import save_quantized_model
save_quantized_model(self.model, model_type, model_filename[0], dtype, base_config_file)
@ -480,8 +486,11 @@ class WanAny2V:
any_end_frame = False
if input_frames != None:
_ , preframes_count, height, width = input_frames.shape
lat_h, lat_w = height // self.vae_stride[1], width // self.vae_stride[2]
clip_context = self.clip.visual([input_frames[:, -1:]]) if model_type != "flf2v_720p" else self.clip.visual([input_frames[:, -1:], input_frames[:, -1:]])
lat_h, lat_w = height // self.vae_stride[1], width // self.vae_stride[2]
if hasattr(self, "clip"):
clip_context = self.clip.visual([input_frames[:, -1:]]) if model_type != "flf2v_720p" else self.clip.visual([input_frames[:, -1:], input_frames[:, -1:]])
else:
clip_context = None
input_frames = input_frames.to(device=self.device).to(dtype= self.VAE_dtype)
enc = torch.concat( [input_frames, torch.zeros( (3, frame_num-preframes_count, height, width),
device=self.device, dtype= self.VAE_dtype)],
@ -513,19 +522,24 @@ class WanAny2V:
self.patch_size[2] * self.patch_size[2])
h = lat_h * self.vae_stride[1]
w = lat_w * self.vae_stride[2]
clip_image_size = self.clip.model.image_size
img_interpolated = resize_lanczos(image_start, h, w).sub_(0.5).div_(0.5).unsqueeze(0).transpose(0,1).to(self.device) #, self.dtype
image_start = resize_lanczos(image_start, clip_image_size, clip_image_size)
image_start = image_start.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
color_reference_frame = image_start.clone()
color_reference_frame = img_interpolated.clone()
if image_end!= None:
img_interpolated2 = resize_lanczos(image_end, h, w).sub_(0.5).div_(0.5).unsqueeze(0).transpose(0,1).to(self.device) #, self.dtype
image_end = resize_lanczos(image_end, clip_image_size, clip_image_size)
image_end = image_end.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
if model_type == "flf2v_720p":
clip_context = self.clip.visual([image_start[:, None, :, :], image_end[:, None, :, :] if image_end != None else image_start[:, None, :, :]])
if hasattr(self, "clip"):
clip_image_size = self.clip.model.image_size
image_start = resize_lanczos(image_start, clip_image_size, clip_image_size)
image_start = image_start.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
if image_end!= None:
image_end = resize_lanczos(image_end, clip_image_size, clip_image_size)
image_end = image_end.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
if model_type == "flf2v_720p":
clip_context = self.clip.visual([image_start[:, None, :, :], image_end[:, None, :, :] if image_end != None else image_start[:, None, :, :]])
else:
clip_context = self.clip.visual([image_start[:, None, :, :]])
else:
clip_context = self.clip.visual([image_start[:, None, :, :]])
clip_context = None
if any_end_frame:
enc= torch.concat([
@ -563,7 +577,9 @@ class WanAny2V:
extended_overlapped_latents = lat_y[:, :overlapped_latents_frames_num].clone().unsqueeze(0)
y = torch.concat([msk, lat_y])
lat_y = None
kwargs.update({'clip_fea': clip_context, 'y': y})
kwargs.update({ 'y': y})
if clip_context is not None:
kwargs.update({'clip_fea': clip_context})
# Recam Master
if target_camera != None:
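The shared_modules block above is what backs the RAM saving mentioned in the v7.5 release note: the first fast_load_transformers_model call collects the tensors both Wan 2.2 experts have in common, and the second call reuses them instead of loading another copy from disk. A condensed sketch of the pattern follows, with the mmgp call signature taken from the diff above, illustrative argument values, and quantization left disabled:

```python
from mmgp import offload  # assumed import path used by WanGP

def load_dual_wan(model_filename, WanModel, base_config_file):
    """Condensed sketch of the shared-module load performed above.

    model_filename is assumed to be a list:
    [transformer_1_ckpt, transformer_2_ckpt, *extra_module_ckpts].
    """
    shared_modules = {}
    # First load fills shared_modules with the tensors both experts have in common.
    model = offload.fast_load_transformers_model(
        model_filename[:1], modules=model_filename[2:], modelClass=WanModel,
        do_quantize=False, writable_tensors=False,
        defaultConfigPath=base_config_file, forcedConfigPath=None,
        return_shared_modules=shared_modules)
    # Second load reuses those tensors instead of reading them again,
    # which is where the up-to-5 GB RAM saving in the release note comes from.
    model2 = offload.fast_load_transformers_model(
        model_filename[1:2], modules=shared_modules, modelClass=WanModel,
        do_quantize=False, writable_tensors=False,
        defaultConfigPath=base_config_file, forcedConfigPath=None)
    shared_modules = None  # drop the temporary reference
    return model, model2
```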

View File

@ -789,7 +789,7 @@ class WanModel(ModelMixin, ConfigMixin):
sd = new_sd
from wgp import test_class_i2v
if not test_class_i2v(model_type):
if not test_class_i2v(model_type) or model_type in ["i2v_2_2"]:
new_sd = {}
# convert loras for i2v to t2v
for k,v in sd.items():
@ -842,7 +842,7 @@ class WanModel(ModelMixin, ConfigMixin):
super().__init__()
assert model_type in ['t2v', 'i2v']
assert model_type in ['t2v', 'i2v', 'i2v2_2']
self.model_type = model_type
self.patch_size = patch_size
@ -889,7 +889,7 @@ class WanModel(ModelMixin, ConfigMixin):
# blocks
if vace_layers == None:
cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn'
cross_attn_type = 't2v_cross_attn' if model_type in ['t2v','i2v2_2'] else 'i2v_cross_attn'
self.blocks = nn.ModuleList([
WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
window_size, qk_norm, cross_attn_norm, eps, block_no =i, output_dim=multitalk_output_dim, norm_input_visual=norm_input_visual)

46
wgp.py
View File

@ -50,8 +50,8 @@ global_queue_ref = []
AUTOSAVE_FILENAME = "queue.zip"
PROMPT_VARS_MAX = 10
target_mmgp_version = "3.5.1"
WanGP_version = "7.4"
target_mmgp_version = "3.5.3"
WanGP_version = "7.5"
settings_version = 2.23
max_source_video_frames = 3000
prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
@ -1581,7 +1581,7 @@ def _parse_args():
def get_lora_dir(model_type):
model_family = get_model_family(model_type)
i2v = test_class_i2v(model_type)
i2v = test_class_i2v(model_type) and not get_base_model_type(model_type) == "i2v_2_2"
if model_family == "wan":
lora_dir =args.lora_dir
if i2v and len(lora_dir)==0:
@ -1691,7 +1691,8 @@ for path in ["wan2.1_Vace_1.3B_preview_bf16.safetensors", "sky_reels2_diffusion
"sky_reels2_diffusion_forcing_720p_14B_quanto_int8.safetensors", "sky_reels2_diffusion_forcing_720p_14B_quanto_fp16_int8.safetensors", "wan2.1_image2video_480p_14B_bf16.safetensors", "wan2.1_image2video_480p_14B_quanto_int8.safetensors",
"wan2.1_image2video_720p_14B_quanto_int8.safetensors", "wan2.1_image2video_720p_14B_quanto_fp16_int8.safetensors", "wan2.1_image2video_720p_14B_bf16.safetensors",
"wan2.1_text2video_14B_bf16.safetensors", "wan2.1_text2video_14B_quanto_int8.safetensors",
"wan2.1_Vace_14B_mbf16.safetensors", "wan2.1_Vace_14B_quanto_mbf16_int8.safetensors", "wan2.1_FLF2V_720p_14B_quanto_int8.safetensors", "wan2.1_FLF2V_720p_14B_bf16.safetensors", "wan2.1_FLF2V_720p_14B_fp16.safetensors", "wan2.1_Vace_1.3B_mbf16.safetensors", "wan2.1_text2video_1.3B_bf16.safetensors"
"wan2.1_Vace_14B_mbf16.safetensors", "wan2.1_Vace_14B_quanto_mbf16_int8.safetensors", "wan2.1_FLF2V_720p_14B_quanto_int8.safetensors", "wan2.1_FLF2V_720p_14B_bf16.safetensors", "wan2.1_FLF2V_720p_14B_fp16.safetensors", "wan2.1_Vace_1.3B_mbf16.safetensors", "wan2.1_text2video_1.3B_bf16.safetensors",
"ltxv_0.9.7_13B_dev_bf16.safetensors"
]:
if Path(os.path.join("ckpts" , path)).is_file():
print(f"Removing old version of model '{path}'. A new version of this model will be downloaded next time you use it.")
@ -1712,7 +1713,7 @@ modules_files = {
base_types = ["multitalk", "fantasy", "vace_14B", "vace_multitalk_14B",
"t2v_1.3B", "t2v", "vace_1.3B", "phantom_1.3B", "phantom_14B",
"recam_1.3B", "sky_df_1.3B", "sky_df_14B",
"i2v", "flf2v_720p", "fun_inp_1.3B", "fun_inp", "ltxv_13B",
"i2v", "i2v_2_2", "flf2v_720p", "fun_inp_1.3B", "fun_inp", "ltxv_13B",
"hunyuan", "hunyuan_i2v", "hunyuan_custom", "hunyuan_custom_audio", "hunyuan_custom_edit", "hunyuan_avatar", "flux"
]
@ -1792,7 +1793,7 @@ def get_model_family(model_type, for_ui = False):
def test_class_i2v(model_type):
model_type = get_base_model_type(model_type)
return model_type in ["i2v", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk" ] #"hunyuan_i2v",
return model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk" ] #"hunyuan_i2v",
def test_vace_module(model_type):
model_type = get_base_model_type(model_type)
@ -2632,7 +2633,8 @@ def load_wan_model(model_filename, model_type, base_model_type, model_def, quant
pipe = {"transformer": wan_model.model, "text_encoder" : wan_model.text_encoder.model, "vae": wan_model.vae.model }
if wan_model.model2 is not None:
pipe["transformer2"] = wan_model.model2
# del pipe["transformer"]
# pipe["transformer"] = wan_model.model
if hasattr(wan_model, "clip"):
pipe["text_encoder_2"] = wan_model.clip.model
return wan_model, pipe
@ -2803,10 +2805,9 @@ def load_models(model_type):
if "transformer2" in pipe:
loras_transformer += ["transformer2"]
if profile in [2,4]:
if profile in [3,4]:
kwargs["pinnedMemory"] = ["transformer", "transformer2"]
global prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer
if server_config.get("enhancer_enabled", 0) == 1:
from transformers import ( AutoModelForCausalLM, AutoProcessor, AutoTokenizer, LlamaForCausalLM )
@ -3317,11 +3318,15 @@ def select_video(state, input_file_list, event_data: gr.EventData):
if video_length != frames_count: video_length_summary += f"real: {frames_count} frames, "
video_length_summary += f"{frames_count/fps:.1f}s, {round(fps)} fps)"
video_guidance_scale = configs.get("guidance_scale", None)
video_guidance2_scale = configs.get("guidance2_scale", None)
video_switch_threshold = configs.get("switch_threshold", 0)
video_embedded_guidance_scale = configs.get("embedded_guidance_scale ", None)
if model_family in ["hunyuan", "flux"]:
video_guidance_scale = video_embedded_guidance_scale
video_guidance_label = "Embedded Guidance Scale"
else:
if video_switch_threshold > 0:
video_guidance_scale = f"{video_guidance_scale} (High Noise), {video_guidance2_scale} (Low Noise) with Switch at Noise Level {video_switch_threshold}"
video_guidance_label = "Guidance"
video_flow_shift = configs.get("flow_shift", None)
video_video_guide_outpainting = configs.get("video_guide_outpainting", "")
@ -4232,15 +4237,15 @@ def generate_video(
loras_selected = transformer_loras_filenames + loras_selected
loras_list_mult_choices_nums = transformer_loras_multipliers + loras_list_mult_choices_nums
loras_slists = transformer_loras_multipliers + loras_slists
trans_list = [trans]
if trans2 is not None: trans_list += [trans2]
for trans_item in trans_list:
offload.load_loras_into_model(trans_item, loras_selected, loras_list_mult_choices_nums, activate_all_loras=True, preprocess_sd=get_loras_preprocessor(trans, base_model_type), pinnedLora=pinnedLora, split_linear_modules_map = split_linear_modules_map)
errors = trans._loras_errors
if len(errors) > 0:
error_files = [msg for _ , msg in errors]
raise gr.Error("Error while loading Loras: " + ", ".join(error_files))
trans_item = trans_list = None
offload.load_loras_into_model(trans , loras_selected, loras_list_mult_choices_nums, activate_all_loras=True, preprocess_sd=get_loras_preprocessor(trans, base_model_type), pinnedLora=pinnedLora, split_linear_modules_map = split_linear_modules_map)
errors = trans._loras_errors
if len(errors) > 0:
error_files = [msg for _ , msg in errors]
raise gr.Error("Error while loading Loras: " + ", ".join(error_files))
if trans2 is not None:
offload.sync_models_loras(trans, trans2)
seed = None if seed == -1 else seed
# negative_prompt = "" # not applicable in the inference
original_filename = model_filename
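The lora hunk above is the other half of the single-copy handling described in the release note: loras are loaded into the primary transformer only, and offload.sync_models_loras then lets the second transformer share those tensors rather than load its own copy. A hedged sketch of that flow, reusing the calls shown in the diff with simplified helper arguments and error handling:

```python
from mmgp import offload  # assumed import path used by WanGP

def attach_loras(trans, trans2, loras_selected, multipliers,
                 preprocess_sd=None, pinnedLora=False, split_linear_modules_map=None):
    # Load the selected loras once, into the primary transformer.
    offload.load_loras_into_model(
        trans, loras_selected, multipliers, activate_all_loras=True,
        preprocess_sd=preprocess_sd, pinnedLora=pinnedLora,
        split_linear_modules_map=split_linear_modules_map)
    errors = trans._loras_errors
    if errors:
        raise RuntimeError("Error while loading Loras: " +
                           ", ".join(msg for _, msg in errors))
    # Share the already-loaded lora tensors with the second transformer
    # instead of loading a second copy into RAM.
    if trans2 is not None:
        offload.sync_models_loras(trans, trans2)
```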
@ -4619,7 +4624,9 @@ def generate_video(
if len(extra_process_list) == 1:
status_info += " and " + processes_names[extra_process_list[0]]
elif len(extra_process_list) == 2:
status_info += ", " + processes_names[extra_process_list[0]] + " and " + processes_names[extra_process_list[1]]
status_info += ", " + processes_names[extra_process_list[0]] + " and " + processes_names[extra_process_list[1]]
if preprocess_type2 is not None:
context_scale = [ control_net_weight /2, control_net_weight2 /2]
send_cmd("progress", [0, get_latest_status(state, status_info)])
video_guide_processed, video_mask_processed = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) , start_frame = guide_start_frame, fit_canvas = sample_fit_canvas, target_fps = fps, process_type = preprocess_type, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =1 )
if preprocess_type2 != None:
@ -4768,6 +4775,7 @@ def generate_video(
remove_temp_filenames(temp_filenames_list)
offloadobj.unload_all()
offload.unload_loras_from_model(trans)
if trans2 is not None: offload.unload_loras_from_model(trans2)
# if compile:
# cache_size = torch._dynamo.config.cache_size_limit
# torch.compiler.reset()