From ee0bb89ee94618eac574f1290dd45532b3b59220 Mon Sep 17 00:00:00 2001
From: DeepBeepMeep
Date: Thu, 25 Sep 2025 02:16:57 +0200
Subject: [PATCH] Added Qwen Preview mode

---
 README.md                         |  3 +-
 models/qwen/pipeline_qwenimage.py |  5 +--
 models/qwen/qwen_handler.py       |  7 ++++
 models/wan/any2video.py           | 54 ++++++++++++++-----------------
 models/wan/wan_handler.py         |  4 ++-
 shared/RGB_factors.py             |  4 +--
 6 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 30a97e6..9b8f70d 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
 
 ## 🔥 Latest Updates :
-### September 24 2025: WanGP v8.72 - Here Are ~~Two~~Three New Contenders in the Vace Arena !
+### September 25 2025: WanGP v8.73 - Here Are ~~Two~~Three New Contenders in the Vace Arena !
 So in today's release you will find two Wannabe Vace that covers each only a subset of Vace features but offers some interesting advantages:
 - **Wan 2.2 Animate**: this model is specialized in *Body Motion* and *Facial Motion transfers*. It does that very well. You can use this model to either *Replace* a person in an in Video or *Animate* the person of your choice using an existing *Pose Video* (remember *Animate Anyone* ?). By default it will keep the original soundtrack. *Wan 2.2 Animate* seems to be under the hood a derived i2v model and should support the corresponding Loras Accelerators (for instance *FusioniX t2v*). Also as a WanGP exclusivity, you will find support for *Outpainting*.
 
@@ -34,6 +34,7 @@ Also because I wanted to spoil you:
 
 *Update 8.71*: fixed Fast Lucy Edit that didnt contain the lora
 *Update 8.72*: shadow drop of Qwen Edit Plus
+*Update 8.73*: Qwen Preview & InfiniteTalk Start image
 
 ### September 15 2025: WanGP v8.6 - Attack of the Clones
 
diff --git a/models/qwen/pipeline_qwenimage.py b/models/qwen/pipeline_qwenimage.py
index 85934b7..134cc51 100644
--- a/models/qwen/pipeline_qwenimage.py
+++ b/models/qwen/pipeline_qwenimage.py
@@ -971,8 +971,9 @@ class QwenImagePipeline(): #DiffusionPipeline
                 latents = latents.to(latents_dtype)
 
             if callback is not None:
-                # preview = unpack_latent(img).transpose(0,1)
-                callback(i, None, False)
+                preview = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+                preview = preview.squeeze(0)
+                callback(i, preview, False)
 
         self._current_timestep = None
 
diff --git a/models/qwen/qwen_handler.py b/models/qwen/qwen_handler.py
index 4fcaa3b..99864a5 100644
--- a/models/qwen/qwen_handler.py
+++ b/models/qwen/qwen_handler.py
@@ -129,6 +129,7 @@ class family_handler():
             "model_mode" : 0,
         })
 
+    @staticmethod
     def validate_generative_settings(base_model_type, model_def, inputs):
         if base_model_type in ["qwen_image_edit_20B", "qwen_image_edit_plus_20B"]:
             model_mode = inputs["model_mode"]
@@ -141,3 +142,9 @@ class family_handler():
                 gr.Info("Denoising Strength will be ignored while using Lora Inpainting")
             if outpainting_dims is not None and model_mode == 0 :
                 return "Outpainting is not supported with Masked Denoising "
+
+    @staticmethod
+    def get_rgb_factors(base_model_type ):
+        from shared.RGB_factors import get_rgb_factors
+        latent_rgb_factors, latent_rgb_factors_bias = get_rgb_factors("qwen")
+        return latent_rgb_factors, latent_rgb_factors_bias
diff --git a/models/wan/any2video.py b/models/wan/any2video.py
index 41d6d63..6b4ae62 100644
--- a/models/wan/any2video.py
+++ b/models/wan/any2video.py
@@ -443,38 +443,32 @@ class WanAny2V:
         # image2video
         if model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "fantasy", "multitalk", "infinitetalk", "i2v_2_2_multitalk", "flf2v_720p"]:
             any_end_frame = False
-            if image_start is None:
-                if infinitetalk:
-                    new_shot = "Q" in video_prompt_type
-                    if input_frames is not None:
-                        image_ref = input_frames[:, 0]
-                    else:
-                        if input_ref_images is None:
-                            if pre_video_frame is None: raise Exception("Missing Reference Image")
-                            input_ref_images, new_shot = [pre_video_frame], False
-                        new_shot = new_shot and window_no <= len(input_ref_images)
-                        image_ref = convert_image_to_tensor(input_ref_images[ min(window_no, len(input_ref_images))-1 ])
-                    if new_shot or input_video is None:
-                        input_video = image_ref.unsqueeze(1)
-                    else:
-                        color_correction_strength = 0 #disable color correction as transition frames between shots may have a complete different color level than the colors of the new shot
-                _ , preframes_count, height, width = input_video.shape
-                input_video = input_video.to(device=self.device).to(dtype= self.VAE_dtype)
-                if infinitetalk:
-                    image_start = image_ref.to(input_video)
-                    control_pre_frames_count = 1
-                    control_video = image_start.unsqueeze(1)
+            if infinitetalk:
+                new_shot = "Q" in video_prompt_type
+                if input_frames is not None:
+                    image_ref = input_frames[:, 0]
                 else:
-                    image_start = input_video[:, -1]
-                    control_pre_frames_count = preframes_count
-                    control_video = input_video
-
-                color_reference_frame = image_start.unsqueeze(1).clone()
+                    if input_ref_images is None:
+                        if pre_video_frame is None: raise Exception("Missing Reference Image")
+                        input_ref_images, new_shot = [pre_video_frame], False
+                    new_shot = new_shot and window_no <= len(input_ref_images)
+                    image_ref = convert_image_to_tensor(input_ref_images[ min(window_no, len(input_ref_images))-1 ])
+                if new_shot or input_video is None:
+                    input_video = image_ref.unsqueeze(1)
+                else:
+                    color_correction_strength = 0 #disable color correction as transition frames between shots may have a complete different color level than the colors of the new shot
+            _ , preframes_count, height, width = input_video.shape
+            input_video = input_video.to(device=self.device).to(dtype= self.VAE_dtype)
+            if infinitetalk:
+                image_start = image_ref.to(input_video)
+                control_pre_frames_count = 1
+                control_video = image_start.unsqueeze(1)
             else:
-                preframes_count = control_pre_frames_count = 1
-                height, width = image_start.shape[1:]
-                control_video = image_start.unsqueeze(1).to(self.device)
-                color_reference_frame = control_video.clone()
+                image_start = input_video[:, -1]
+                control_pre_frames_count = preframes_count
+                control_video = input_video
+
+            color_reference_frame = image_start.unsqueeze(1).clone()
             any_end_frame = image_end is not None
             add_frames_for_end_image = any_end_frame and model_type == "i2v"
 
diff --git a/models/wan/wan_handler.py b/models/wan/wan_handler.py
index 12ddfed..574c990 100644
--- a/models/wan/wan_handler.py
+++ b/models/wan/wan_handler.py
@@ -245,8 +245,10 @@ class family_handler():
                 "visible" : False,
             }
 
-        if vace_class or base_model_type in ["infinitetalk", "animate"]:
+        if vace_class or base_model_type in ["animate"]:
             image_prompt_types_allowed = "TVL"
+        elif base_model_type in ["infinitetalk"]:
+            image_prompt_types_allowed = "TSVL"
         elif base_model_type in ["ti2v_2_2"]:
             image_prompt_types_allowed = "TSVL"
         elif base_model_type in ["lucy_edit"]:
diff --git a/shared/RGB_factors.py b/shared/RGB_factors.py
index 6e865fa..8a870b4 100644
--- a/shared/RGB_factors.py
+++ b/shared/RGB_factors.py
@@ -1,6 +1,6 @@
 # thanks Comfyui for the rgb factors (https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py)
 def get_rgb_factors(model_family, model_type = None):
-    if model_family == "wan":
+    if model_family in ["wan", "qwen"]:
         if model_type =="ti2v_2_2":
             latent_channels = 48
             latent_dimensions = 3
@@ -261,7 +261,7 @@ def get_rgb_factors(model_family, model_type = None):
             [ 0.0249, -0.0469, -0.1703]
         ]
 
-          latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
+        latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
     else:
         latent_rgb_factors_bias = latent_rgb_factors = None
     return latent_rgb_factors, latent_rgb_factors_bias
\ No newline at end of file
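
Note (editorial sketch, not part of the patch): the preview tensor that pipeline_qwenimage.py now hands to callback(i, preview, False) is still a raw latent. The per-channel factors and bias that the new family_handler.get_rgb_factors("qwen") exposes (borrowed from ComfyUI, per the comment at the top of shared/RGB_factors.py) are the usual way such a latent is projected to a rough RGB preview without running the VAE. The code below is a minimal illustration of that projection, assuming a [C, H, W] latent; the helper name and the factor values are placeholders, the real factors come from get_rgb_factors().

import torch

def latent_to_rgb_preview(latents: torch.Tensor, rgb_factors, rgb_bias) -> torch.Tensor:
    # Project a [C, H, W] latent onto 3 RGB channels with a fixed linear map.
    factors = torch.tensor(rgb_factors, dtype=latents.dtype, device=latents.device)  # [C, 3]
    bias = torch.tensor(rgb_bias, dtype=latents.dtype, device=latents.device)        # [3]
    rgb = latents.permute(1, 2, 0) @ factors + bias   # [H, W, C] @ [C, 3] -> [H, W, 3]
    rgb = rgb.permute(2, 0, 1)                        # back to [3, H, W]
    return ((rgb + 1.0) / 2.0).clamp(0.0, 1.0)        # map roughly [-1, 1] to [0, 1] for display

# Placeholder usage: the channel count and factor values below are made up for the example.
demo_latent = torch.randn(16, 64, 64)
demo_factors = [[0.02, -0.01, 0.03]] * 16
demo_bias = [0.0, 0.0, 0.0]
print(latent_to_rgb_preview(demo_latent, demo_factors, demo_bias).shape)  # torch.Size([3, 64, 64])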