From ee0bb89ee94618eac574f1290dd45532b3b59220 Mon Sep 17 00:00:00 2001
From: DeepBeepMeep
Date: Thu, 25 Sep 2025 02:16:57 +0200
Subject: [PATCH] Added Qwen Preview mode

---
 README.md                         |  3 +-
 models/qwen/pipeline_qwenimage.py |  5 +--
 models/qwen/qwen_handler.py       |  7 ++++
 models/wan/any2video.py           | 54 ++++++++++++++-----------------
 models/wan/wan_handler.py         |  4 ++-
 shared/RGB_factors.py             |  4 +--
 6 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 30a97e6..9b8f70d 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
 
 ## 🔥 Latest Updates :
-### September 24 2025: WanGP v8.72 - Here Are ~~Two~~Three New Contenders in the Vace Arena !
+### September 25 2025: WanGP v8.73 - Here Are ~~Two~~Three New Contenders in the Vace Arena !
 So in today's release you will find two Wannabe Vace that covers each only a subset of Vace features but offers some interesting advantages:
 - **Wan 2.2 Animate**: this model is specialized in *Body Motion* and *Facial Motion transfers*. It does that very well. You can use this model to either *Replace* a person in an in Video or *Animate* the person of your choice using an existing *Pose Video* (remember *Animate Anyone* ?). By default it will keep the original soundtrack. *Wan 2.2 Animate* seems to be under the hood a derived i2v model and should support the corresponding Loras Accelerators (for instance *FusioniX t2v*). Also as a WanGP exclusivity, you will find support for *Outpainting*.
 
@@ -34,6 +34,7 @@ Also because I wanted to spoil you:
 
 *Update 8.71*: fixed Fast Lucy Edit that didnt contain the lora
 *Update 8.72*: shadow drop of Qwen Edit Plus
+*Update 8.73*: Qwen Preview & InfiniteTalk Start image
 
 ### September 15 2025: WanGP v8.6 - Attack of the Clones
 
diff --git a/models/qwen/pipeline_qwenimage.py b/models/qwen/pipeline_qwenimage.py
index 85934b7..134cc51 100644
--- a/models/qwen/pipeline_qwenimage.py
+++ b/models/qwen/pipeline_qwenimage.py
@@ -971,8 +971,9 @@ class QwenImagePipeline(): #DiffusionPipeline
                 latents = latents.to(latents_dtype)
 
             if callback is not None:
-                # preview = unpack_latent(img).transpose(0,1)
-                callback(i, None, False)
+                preview = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+                preview = preview.squeeze(0)
+                callback(i, preview, False)
 
         self._current_timestep = None
 
diff --git a/models/qwen/qwen_handler.py b/models/qwen/qwen_handler.py
index 4fcaa3b..99864a5 100644
--- a/models/qwen/qwen_handler.py
+++ b/models/qwen/qwen_handler.py
@@ -129,6 +129,7 @@ class family_handler():
             "model_mode" : 0,
         })
 
+    @staticmethod
     def validate_generative_settings(base_model_type, model_def, inputs):
         if base_model_type in ["qwen_image_edit_20B", "qwen_image_edit_plus_20B"]:
             model_mode = inputs["model_mode"]
@@ -141,3 +142,9 @@ class family_handler():
                 gr.Info("Denoising Strength will be ignored while using Lora Inpainting")
             if outpainting_dims is not None and model_mode == 0 :
                 return "Outpainting is not supported with Masked Denoising "
+
+    @staticmethod
+    def get_rgb_factors(base_model_type ):
+        from shared.RGB_factors import get_rgb_factors
+        latent_rgb_factors, latent_rgb_factors_bias = get_rgb_factors("qwen")
+        return latent_rgb_factors, latent_rgb_factors_bias
diff --git a/models/wan/any2video.py b/models/wan/any2video.py
index 41d6d63..6b4ae62 100644
--- a/models/wan/any2video.py
+++ b/models/wan/any2video.py
@@ -443,38 +443,32 @@ class WanAny2V:
         # image2video
         if model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "fantasy", "multitalk", "infinitetalk", "i2v_2_2_multitalk", "flf2v_720p"]:
             any_end_frame = False
-            if image_start is None:
-                if infinitetalk:
-                    new_shot = "Q" in video_prompt_type
-                    if input_frames is not None:
-                        image_ref = input_frames[:, 0]
-                    else:
-                        if input_ref_images is None:
-                            if pre_video_frame is None: raise Exception("Missing Reference Image")
-                            input_ref_images, new_shot = [pre_video_frame], False
-                        new_shot = new_shot and window_no <= len(input_ref_images)
-                        image_ref = convert_image_to_tensor(input_ref_images[ min(window_no, len(input_ref_images))-1 ])
-                    if new_shot or input_video is None:
-                        input_video = image_ref.unsqueeze(1)
-                    else:
-                        color_correction_strength = 0 #disable color correction as transition frames between shots may have a complete different color level than the colors of the new shot
-                _ , preframes_count, height, width = input_video.shape
-                input_video = input_video.to(device=self.device).to(dtype= self.VAE_dtype)
-                if infinitetalk:
-                    image_start = image_ref.to(input_video)
-                    control_pre_frames_count = 1
-                    control_video = image_start.unsqueeze(1)
+            if infinitetalk:
+                new_shot = "Q" in video_prompt_type
+                if input_frames is not None:
+                    image_ref = input_frames[:, 0]
                 else:
-                    image_start = input_video[:, -1]
-                    control_pre_frames_count = preframes_count
-                    control_video = input_video
-
-                color_reference_frame = image_start.unsqueeze(1).clone()
+                    if input_ref_images is None:
+                        if pre_video_frame is None: raise Exception("Missing Reference Image")
+                        input_ref_images, new_shot = [pre_video_frame], False
+                    new_shot = new_shot and window_no <= len(input_ref_images)
+                    image_ref = convert_image_to_tensor(input_ref_images[ min(window_no, len(input_ref_images))-1 ])
+                if new_shot or input_video is None:
+                    input_video = image_ref.unsqueeze(1)
+                else:
+                    color_correction_strength = 0 #disable color correction as transition frames between shots may have a complete different color level than the colors of the new shot
+            _ , preframes_count, height, width = input_video.shape
+            input_video = input_video.to(device=self.device).to(dtype= self.VAE_dtype)
+            if infinitetalk:
+                image_start = image_ref.to(input_video)
+                control_pre_frames_count = 1
+                control_video = image_start.unsqueeze(1)
             else:
-                preframes_count = control_pre_frames_count = 1
-                height, width = image_start.shape[1:]
-                control_video = image_start.unsqueeze(1).to(self.device)
-                color_reference_frame = control_video.clone()
+                image_start = input_video[:, -1]
+                control_pre_frames_count = preframes_count
+                control_video = input_video
+
+            color_reference_frame = image_start.unsqueeze(1).clone()
             any_end_frame = image_end is not None
             add_frames_for_end_image = any_end_frame and model_type == "i2v"
 
diff --git a/models/wan/wan_handler.py b/models/wan/wan_handler.py
index 12ddfed..574c990 100644
--- a/models/wan/wan_handler.py
+++ b/models/wan/wan_handler.py
@@ -245,8 +245,10 @@ class family_handler():
                 "visible" : False,
             }
 
-        if vace_class or base_model_type in ["infinitetalk", "animate"]:
+        if vace_class or base_model_type in ["animate"]:
             image_prompt_types_allowed = "TVL"
+        elif base_model_type in ["infinitetalk"]:
+            image_prompt_types_allowed = "TSVL"
         elif base_model_type in ["ti2v_2_2"]:
             image_prompt_types_allowed = "TSVL"
         elif base_model_type in ["lucy_edit"]:
diff --git a/shared/RGB_factors.py b/shared/RGB_factors.py
index 6e865fa..8a870b4 100644
--- a/shared/RGB_factors.py
+++ b/shared/RGB_factors.py
@@ -1,6 +1,6 @@
 # thanks Comfyui for the rgb factors (https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py)
 def get_rgb_factors(model_family, model_type = None):
-    if model_family == "wan":
+    if model_family in ["wan", "qwen"]:
         if model_type =="ti2v_2_2":
             latent_channels = 48
             latent_dimensions = 3
@@ -261,7 +261,7 @@ def get_rgb_factors(model_family, model_type = None):
             [ 0.0249, -0.0469, -0.1703]
         ]
 
-          latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
+        latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
     else:
         latent_rgb_factors_bias = latent_rgb_factors = None
     return latent_rgb_factors, latent_rgb_factors_bias
\ No newline at end of file
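
Note (editorial sketch, not part of the patch): the preview tensor that pipeline_qwenimage.py now hands to callback(i, preview, False) is still a raw latent. The per-channel factors and bias that the new family_handler.get_rgb_factors("qwen") exposes (borrowed from ComfyUI, per the comment at the top of shared/RGB_factors.py) are the usual way such a latent is projected to a rough RGB preview without running the VAE. The code below is a minimal illustration of that projection, assuming a [C, H, W] latent; the helper name and the factor values are placeholders, the real factors come from get_rgb_factors().

import torch

def latent_to_rgb_preview(latents: torch.Tensor, rgb_factors, rgb_bias) -> torch.Tensor:
    # Project a [C, H, W] latent onto 3 RGB channels with a fixed linear map.
    factors = torch.tensor(rgb_factors, dtype=latents.dtype, device=latents.device)  # [C, 3]
    bias = torch.tensor(rgb_bias, dtype=latents.dtype, device=latents.device)        # [3]
    rgb = latents.permute(1, 2, 0) @ factors + bias   # [H, W, C] @ [C, 3] -> [H, W, 3]
    rgb = rgb.permute(2, 0, 1)                        # back to [3, H, W]
    return ((rgb + 1.0) / 2.0).clamp(0.0, 1.0)        # map roughly [-1, 1] to [0, 1] for display

# Placeholder usage: the channel count and factor values below are made up for the example.
demo_latent = torch.randn(16, 64, 64)
demo_factors = [[0.02, -0.01, 0.03]] * 16
demo_bias = [0.0, 0.0, 0.0]
print(latent_to_rgb_preview(demo_latent, demo_factors, demo_bias).shape)  # torch.Size([3, 64, 64])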