From 76f86c43ad12914626cf6d3ad623ba1f6f551640 Mon Sep 17 00:00:00 2001 From: DeepBeepMeep Date: Wed, 24 Sep 2025 23:38:27 +0200 Subject: [PATCH] add qwen edit plus support --- README.md | 6 +- defaults/qwen_image_edit_plus_20B.json | 17 +++ models/flux/flux_handler.py | 2 +- models/qwen/pipeline_qwenimage.py | 161 ++++++++++++++----------- models/qwen/qwen_handler.py | 21 +++- models/qwen/qwen_main.py | 21 ++-- preprocessing/matanyone/app.py | 4 +- shared/utils/utils.py | 4 +- wgp.py | 35 ++++-- 9 files changed, 170 insertions(+), 101 deletions(-) create mode 100644 defaults/qwen_image_edit_plus_20B.json diff --git a/README.md b/README.md index 8cee834..b8c92b6 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep ## 🔥 Latest Updates : -### September 23 2025: WanGP v8.7 - Here Are Two New Contenders in the Vace Arena ! +### September 24 2025: WanGP v8.72 - Here Are ~~Two~~ Three New Contenders in the Vace Arena ! So in today's release you will find two Wannabe Vace that covers each only a subset of Vace features but offers some interesting advantages: - **Wan 2.2 Animate**: this model is specialized in *Body Motion* and *Facial Motion transfers*. It does that very well. You can either *Replace* a person in a Video or *Animate* the person of your choice using an existing *Pose Video* (remember *Animate Anyone* ?). By default it will keep the original soundtrack. *Wan 2.2 Animate* seems to be under the hood a derived i2v model and should support the corresponding Loras Accelerators (for instance *FusioniX i2v*). Also as a WanGP exclusivity, you will find support for *Outpainting*. @@ -29,7 +29,11 @@ In order to use Wan 2.2 Animate you will need first to stop by the *Mat Anyone* - **Lucy Edit**: this one claims to be a *Nano Banana* for Videos. 
Give it a video and asks it to change it (it is specialized in clothes changing) and voila ! The nice thing about it is that is it based on the *Wan 2.2 5B* model and therefore is very fast especially if you the *FastWan* finetune that is also part of the package. +Also because I wanted to spoil you: +- **Qwen Edit Plus**: also known as the *Qwen Edit 25th September Update*, which is specialized in combining multiple Objects / People. There is also new support for *Pose transfer* & *Recolorisation*. All of this is made easy to use in WanGP. For now you will find only the quantized version, since HF crashes when uploading the unquantized version. + *Update 8.71*: fixed Fast Lucy Edit that didnt contain the lora +*Update 8.72*: shadow drop of Qwen Edit Plus ### September 15 2025: WanGP v8.6 - Attack of the Clones diff --git a/defaults/qwen_image_edit_plus_20B.json b/defaults/qwen_image_edit_plus_20B.json new file mode 100644 index 0000000..e10deb2 --- /dev/null +++ b/defaults/qwen_image_edit_plus_20B.json @@ -0,0 +1,17 @@ +{ + "model": { + "name": "Qwen Image Edit Plus 20B", + "architecture": "qwen_image_edit_plus_20B", + "description": "Qwen Image Edit Plus is a generative model that can generate very high quality images with long texts in it. Best results will be at 720p. 
This model is optimized to combine multiple Subjects & Objects.", + "URLs": [ + "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_plus_20B_quanto_bf16_int8.safetensors" + ], + "preload_URLs": "qwen_image_edit_20B", + "attention": { + "<89": "sdpa" + } + }, + "prompt": "add a hat", + "resolution": "1024x1024", + "batch_size": 1 +} \ No newline at end of file diff --git a/models/flux/flux_handler.py b/models/flux/flux_handler.py index 83de7c3..471c339 100644 --- a/models/flux/flux_handler.py +++ b/models/flux/flux_handler.py @@ -28,7 +28,7 @@ class family_handler(): extra_model_def["any_image_refs_relative_size"] = True extra_model_def["no_background_removal"] = True extra_model_def["image_ref_choices"] = { - "choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "KI"), + "choices":[("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "KI"), ("Up to two Images are Style Images", "KIJ")], "default": "KI", "letters_filter": "KIJ", diff --git a/models/qwen/pipeline_qwenimage.py b/models/qwen/pipeline_qwenimage.py index 1472886..85934b7 100644 --- a/models/qwen/pipeline_qwenimage.py +++ b/models/qwen/pipeline_qwenimage.py @@ -200,7 +200,8 @@ class QwenImagePipeline(): #DiffusionPipeline self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) self.tokenizer_max_length = 1024 if processor is not None: - self.prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" + # self.prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" + self.prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" self.prompt_template_encode_start_idx = 64 else: self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" @@ -232,6 +233,21 @@ class QwenImagePipeline(): #DiffusionPipeline txt = [template.format(e) for e in prompt] if self.processor is not None and image is not None: + img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>" + if isinstance(image, list): + base_img_prompt = "" + for i, img in enumerate(image): + base_img_prompt += img_prompt_template.format(i + 1) + elif image is not None: + base_img_prompt = img_prompt_template.format(1) + else: + base_img_prompt = "" + + template = self.prompt_template_encode + + drop_idx = 
self.prompt_template_encode_start_idx + txt = [template.format(base_img_prompt + e) for e in prompt] + model_inputs = self.processor( text=txt, images=image, @@ -464,7 +480,7 @@ class QwenImagePipeline(): #DiffusionPipeline def prepare_latents( self, - image, + images, batch_size, num_channels_latents, height, @@ -482,24 +498,30 @@ class QwenImagePipeline(): #DiffusionPipeline shape = (batch_size, num_channels_latents, 1, height, width) image_latents = None - if image is not None: - image = image.to(device=device, dtype=dtype) - if image.shape[1] != self.latent_channels: - image_latents = self._encode_vae_image(image=image, generator=generator) - else: - image_latents = image - if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: - # expand init_latents for batch_size - additional_image_per_prompt = batch_size // image_latents.shape[0] - image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) - elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." 
- ) - else: - image_latents = torch.cat([image_latents], dim=0) + if images is not None and len(images ) > 0: + if not isinstance(images, list): + images = [images] + all_image_latents = [] + for image in images: + image = image.to(device=device, dtype=dtype) + if image.shape[1] != self.latent_channels: + image_latents = self._encode_vae_image(image=image, generator=generator) + else: + image_latents = image + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + else: + image_latents = torch.cat([image_latents], dim=0) - image_latents = self._pack_latents(image_latents) + image_latents = self._pack_latents(image_latents) + all_image_latents.append(image_latents) + image_latents = torch.cat(all_image_latents, dim=1) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( @@ -568,6 +590,7 @@ class QwenImagePipeline(): #DiffusionPipeline joint_pass= True, lora_inpaint = False, outpainting_dims = None, + qwen_edit_plus = False, ): r""" Function invoked when calling the pipeline for generation. 
@@ -683,61 +706,54 @@ class QwenImagePipeline(): #DiffusionPipeline batch_size = prompt_embeds.shape[0] device = "cuda" - prompt_image = None + condition_images = [] + vae_image_sizes = [] + vae_images = [] image_mask_latents = None - if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels): - image = image[0] if isinstance(image, list) else image - image_height, image_width = self.image_processor.get_default_height_width(image) - aspect_ratio = image_width / image_height - if False : - _, image_width, image_height = min( - (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS - ) - image_width = image_width // multiple_of * multiple_of - image_height = image_height // multiple_of * multiple_of - ref_height, ref_width = 1568, 672 + ref_size = 1024 + ref_text_encoder_size = 384 if qwen_edit_plus else 1024 + if image is not None: + if not isinstance(image, list): image = [image] + if height * width < ref_size * ref_size: ref_size = round(math.sqrt(height * width)) + for ref_no, img in enumerate(image): + image_width, image_height = img.size + any_mask = ref_no == 0 and image_mask is not None + if (image_height * image_width > ref_size * ref_size) and not any_mask: + vae_height, vae_width =calculate_new_dimensions(ref_size, ref_size, image_height, image_width, False, block_size=multiple_of) + else: + vae_height, vae_width = image_height, image_width + vae_width = vae_width // multiple_of * multiple_of + vae_height = vae_height // multiple_of * multiple_of + vae_image_sizes.append((vae_width, vae_height)) + condition_height, condition_width =calculate_new_dimensions(ref_text_encoder_size, ref_text_encoder_size, image_height, image_width, False, block_size=multiple_of) + condition_images.append(img.resize((condition_width, condition_height), resample=Image.Resampling.LANCZOS) ) + if img.size != (vae_width, vae_height): + img = img.resize((vae_width, vae_height), resample=Image.Resampling.LANCZOS) + 
if any_mask : + if lora_inpaint: + image_mask_rebuilt = torch.where(convert_image_to_tensor(image_mask)>-0.5, 1., 0. )[0:1] + img = convert_image_to_tensor(img) + green = torch.tensor([-1.0, 1.0, -1.0]).to(img) + green_image = green[:, None, None] .expand_as(img) + img = torch.where(image_mask_rebuilt > 0, green_image, img) + img = convert_tensor_to_image(img) + else: + image_mask_latents = convert_image_to_tensor(image_mask.resize((vae_width // 8, vae_height // 8), resample=Image.Resampling.LANCZOS)) + image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1] + image_mask_rebuilt = image_mask_latents.repeat_interleave(8, dim=-1).repeat_interleave(8, dim=-2).unsqueeze(0) + # convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") + image_mask_latents = image_mask_latents.to(device).unsqueeze(0).unsqueeze(0).repeat(1,16,1,1,1) + image_mask_latents = self._pack_latents(image_mask_latents) + # img.save("nnn.png") + vae_images.append( convert_image_to_tensor(img).unsqueeze(0).unsqueeze(2) ) - if image_mask is None: - if height * width < ref_height * ref_width: ref_height , ref_width = height , width - if image_height * image_width > ref_height * ref_width: - image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) - if (image_width,image_height) != image.size: - image = image.resize((image_width,image_height), resample=Image.Resampling.LANCZOS) - elif not lora_inpaint: - # _, image_width, image_height = min( - # (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS - # ) - image_height, image_width = calculate_new_dimensions(height, width, image_height, image_width, False, block_size=multiple_of) - # image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) - height, width = image_height, image_width - image_mask_latents = 
convert_image_to_tensor(image_mask.resize((width // 8, height // 8), resample=Image.Resampling.LANCZOS)) - image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1] - image_mask_rebuilt = image_mask_latents.repeat_interleave(8, dim=-1).repeat_interleave(8, dim=-2).unsqueeze(0) - # convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") - image_mask_latents = image_mask_latents.to(device).unsqueeze(0).unsqueeze(0).repeat(1,16,1,1,1) - image_mask_latents = self._pack_latents(image_mask_latents) - - prompt_image = image - if image.size != (image_width, image_height): - image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS) - - image = convert_image_to_tensor(image) - if lora_inpaint: - image_mask_rebuilt = torch.where(convert_image_to_tensor(image_mask)>-0.5, 1., 0. )[0:1] - image_mask_latents = None - green = torch.tensor([-1.0, 1.0, -1.0]).to(image) - green_image = green[:, None, None] .expand_as(image) - image = torch.where(image_mask_rebuilt > 0, green_image, image) - prompt_image = convert_tensor_to_image(image) - image = image.unsqueeze(0).unsqueeze(2) - # image.save("nnn.png") has_neg_prompt = negative_prompt is not None or ( negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None ) do_true_cfg = true_cfg_scale > 1 and has_neg_prompt prompt_embeds, prompt_embeds_mask = self.encode_prompt( - image=prompt_image, + image=condition_images, prompt=prompt, prompt_embeds=prompt_embeds, prompt_embeds_mask=prompt_embeds_mask, @@ -747,7 +763,7 @@ class QwenImagePipeline(): #DiffusionPipeline ) if do_true_cfg: negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt( - image=prompt_image, + image=condition_images, prompt=negative_prompt, prompt_embeds=negative_prompt_embeds, prompt_embeds_mask=negative_prompt_embeds_mask, @@ -763,7 +779,7 @@ class QwenImagePipeline(): #DiffusionPipeline # 4. 
Prepare latent variables num_channels_latents = self.transformer.in_channels // 4 latents, image_latents = self.prepare_latents( - image, + vae_images, batch_size * num_images_per_prompt, num_channels_latents, height, @@ -779,7 +795,12 @@ class QwenImagePipeline(): #DiffusionPipeline img_shapes = [ [ (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2), - (1, image_height // self.vae_scale_factor // 2, image_width // self.vae_scale_factor // 2), + # (1, image_height // self.vae_scale_factor // 2, image_width // self.vae_scale_factor // 2), + *[ + (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2) + for vae_width, vae_height in vae_image_sizes + ], + ] ] * batch_size else: @@ -971,7 +992,7 @@ class QwenImagePipeline(): #DiffusionPipeline latents = latents / latents_std + latents_mean output_image = self.vae.decode(latents, return_dict=False)[0][:, :, 0] if image_mask is not None and not lora_inpaint : #not (lora_inpaint and outpainting_dims is not None): - output_image = image.squeeze(2) * (1 - image_mask_rebuilt) + output_image.to(image) * image_mask_rebuilt + output_image = vae_images[0].squeeze(2) * (1 - image_mask_rebuilt) + output_image.to(vae_images[0] ) * image_mask_rebuilt return output_image diff --git a/models/qwen/qwen_handler.py b/models/qwen/qwen_handler.py index cc6a764..4fcaa3b 100644 --- a/models/qwen/qwen_handler.py +++ b/models/qwen/qwen_handler.py @@ -20,7 +20,7 @@ class family_handler(): "fit_into_canvas_image_refs": 0, } - if base_model_type in ["qwen_image_edit_20B"]: + if base_model_type in ["qwen_image_edit_20B", "qwen_image_edit_plus_20B"]: extra_model_def["inpaint_support"] = True extra_model_def["image_ref_choices"] = { "choices": [ @@ -42,11 +42,20 @@ class family_handler(): "image_modes" : [2], } + if base_model_type in ["qwen_image_edit_plus_20B"]: + extra_model_def["guide_preprocessing"] = { + "selection": ["", "PV", "SV", "CV"], + } + + extra_model_def["mask_preprocessing"] 
= { + "selection": ["", "A"], + "visible": False, + } return extra_model_def @staticmethod def query_supported_types(): - return ["qwen_image_20B", "qwen_image_edit_20B"] + return ["qwen_image_20B", "qwen_image_edit_20B", "qwen_image_edit_plus_20B"] @staticmethod def query_family_maps(): @@ -113,9 +122,15 @@ class family_handler(): "denoising_strength" : 1., "model_mode" : 0, }) + elif base_model_type in ["qwen_image_edit_plus_20B"]: + ui_defaults.update({ + "video_prompt_type": "I", + "denoising_strength" : 1., + "model_mode" : 0, + }) def validate_generative_settings(base_model_type, model_def, inputs): - if base_model_type in ["qwen_image_edit_20B"]: + if base_model_type in ["qwen_image_edit_20B", "qwen_image_edit_plus_20B"]: model_mode = inputs["model_mode"] denoising_strength= inputs["denoising_strength"] video_guide_outpainting= inputs["video_guide_outpainting"] diff --git a/models/qwen/qwen_main.py b/models/qwen/qwen_main.py index abd5c5c..8c4514f 100644 --- a/models/qwen/qwen_main.py +++ b/models/qwen/qwen_main.py @@ -51,10 +51,10 @@ class model_factory(): transformer_filename = model_filename[0] processor = None tokenizer = None - if base_model_type == "qwen_image_edit_20B": + if base_model_type in ["qwen_image_edit_20B", "qwen_image_edit_plus_20B"]: processor = Qwen2VLProcessor.from_pretrained(os.path.join(checkpoint_dir,"Qwen2.5-VL-7B-Instruct")) tokenizer = AutoTokenizer.from_pretrained(os.path.join(checkpoint_dir,"Qwen2.5-VL-7B-Instruct")) - + self.base_model_type = base_model_type base_config_file = "configs/qwen_image_20B.json" with open(base_config_file, 'r', encoding='utf-8') as f: @@ -173,7 +173,7 @@ class model_factory(): self.vae.tile_latent_min_height = VAE_tile_size[1] self.vae.tile_latent_min_width = VAE_tile_size[1] - + qwen_edit_plus = self.base_model_type in ["qwen_image_edit_plus_20B"] self.vae.enable_slicing() # width, height = aspect_ratios["16:9"] @@ -182,17 +182,19 @@ class model_factory(): image_mask = None if input_masks is None 
else convert_tensor_to_image(input_masks, mask_levels= True) if input_frames is not None: - input_ref_images = [convert_tensor_to_image(input_frames) ] - elif input_ref_images is not None: + input_ref_images = [convert_tensor_to_image(input_frames) ] + ([] if input_ref_images is None else input_ref_images ) + + if input_ref_images is not None: # image stiching method stiched = input_ref_images[0] if "K" in video_prompt_type : w, h = input_ref_images[0].size height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas) - for new_img in input_ref_images[1:]: - stiched = stitch_images(stiched, new_img) - input_ref_images = [stiched] + if not qwen_edit_plus: + for new_img in input_ref_images[1:]: + stiched = stitch_images(stiched, new_img) + input_ref_images = [stiched] image = self.pipeline( prompt=input_prompt, @@ -212,7 +214,8 @@ class model_factory(): generator=torch.Generator(device="cuda").manual_seed(seed), lora_inpaint = image_mask is not None and model_mode == 1, outpainting_dims = outpainting_dims, - ) + qwen_edit_plus = qwen_edit_plus, + ) if image is None: return None return image.transpose(0, 1) diff --git a/preprocessing/matanyone/app.py b/preprocessing/matanyone/app.py index d40811d..df71b40 100644 --- a/preprocessing/matanyone/app.py +++ b/preprocessing/matanyone/app.py @@ -21,6 +21,7 @@ from .utils.get_default_model import get_matanyone_model from .matanyone.inference.inference_core import InferenceCore from .matanyone_wrapper import matanyone from shared.utils.audio_video import save_video, save_image +from mmgp import offload arg_device = "cuda" arg_sam_model_type="vit_h" @@ -539,7 +540,7 @@ def video_matting(video_state,video_input, end_slider, matting_type, interactive file_name = ".".join(file_name.split(".")[:-1]) from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files - source_audio_tracks, audio_metadata = extract_audio_tracks(video_input) + source_audio_tracks, 
audio_metadata = extract_audio_tracks(video_input, verbose= offload.default_verboseLevel ) output_fg_path = f"./mask_outputs/{file_name}_fg.mp4" output_fg_temp_path = f"./mask_outputs/{file_name}_fg_tmp.mp4" if len(source_audio_tracks) == 0: @@ -679,7 +680,6 @@ def load_unload_models(selected): } # os.path.join('.') - from mmgp import offload # sam_checkpoint = load_file_from_url(sam_checkpoint_url_dict[arg_sam_model_type], ".") sam_checkpoint = None diff --git a/shared/utils/utils.py b/shared/utils/utils.py index bb2d5ff..00178aa 100644 --- a/shared/utils/utils.py +++ b/shared/utils/utils.py @@ -321,7 +321,7 @@ def fit_image_into_canvas(ref_img, image_size, canvas_tf_bg =127.5, device ="cpu ref_width, ref_height = ref_img.size if (ref_height, ref_width) == image_size and outpainting_dims == None: ref_img = TF.to_tensor(ref_img).sub_(0.5).div_(0.5).unsqueeze(1) - canvas = torch.zeros_like(ref_img) if return_mask else None + canvas = torch.zeros_like(ref_img[:1]) if return_mask else None else: if outpainting_dims != None: final_height, final_width = image_size @@ -374,7 +374,7 @@ def prepare_video_guide_and_mask( video_guides, video_masks, pre_video_guide, im if pre_video_guide is not None: src_video = pre_video_guide if src_video is None else torch.cat( [pre_video_guide, src_video], dim=1) if any_mask: - src_mask = torch.zeros_like(pre_video_guide[0:1]) if src_mask is None else torch.cat( [torch.zeros_like(pre_video_guide[0:1]), src_mask], dim=1) + src_mask = torch.zeros_like(pre_video_guide[:1]) if src_mask is None else torch.cat( [torch.zeros_like(pre_video_guide[:1]), src_mask], dim=1) if any_guide_padding: if src_video is None: diff --git a/wgp.py b/wgp.py index d48c7fa..f92cf33 100644 --- a/wgp.py +++ b/wgp.py @@ -63,7 +63,7 @@ AUTOSAVE_FILENAME = "queue.zip" PROMPT_VARS_MAX = 10 target_mmgp_version = "3.6.0" -WanGP_version = "8.71" +WanGP_version = "8.72" settings_version = 2.35 max_source_video_frames = 3000 prompt_enhancer_image_caption_model, 
prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None @@ -4987,7 +4987,7 @@ def generate_video( frames_to_inject[pos] = image_refs[i] - video_guide_processed = video_mask_processed = video_guide_processed2 = video_mask_processed2 = None + video_guide_processed = video_mask_processed = video_guide_processed2 = video_mask_processed2 = sparse_video_image = None if video_guide is not None: keep_frames_parsed_full, error = parse_keep_frames_video_guide(keep_frames_video_guide, source_video_frames_count -source_video_overlap_frames_count + requested_frames_to_generate) if len(error) > 0: @@ -6566,11 +6566,11 @@ def switch_image_mode(state): inpaint_support = model_def.get("inpaint_support", False) if inpaint_support: if image_mode == 1: - video_prompt_type = del_in_sequence(video_prompt_type, "VAG") + video_prompt_type = del_in_sequence(video_prompt_type, "VAG" + all_guide_processes) video_prompt_type = add_to_sequence(video_prompt_type, "KI") elif image_mode == 2: + video_prompt_type = del_in_sequence(video_prompt_type, "KI" + all_guide_processes) video_prompt_type = add_to_sequence(video_prompt_type, "VAG") - video_prompt_type = del_in_sequence(video_prompt_type, "KI") ui_defaults["video_prompt_type"] = video_prompt_type return str(time.time()) @@ -6965,10 +6965,11 @@ def refresh_video_prompt_type_alignment(state, video_prompt_type, video_prompt_t video_prompt_type = add_to_sequence(video_prompt_type, video_prompt_type_video_guide) return video_prompt_type +all_guide_processes ="PDESLCMUVB" def refresh_video_prompt_type_video_guide(state, video_prompt_type, video_prompt_type_video_guide, image_mode, old_image_mask_guide_value, old_image_guide_value, old_image_mask_value ): old_video_prompt_type = video_prompt_type - video_prompt_type = del_in_sequence(video_prompt_type, "PDESLCMUVB") + video_prompt_type = del_in_sequence(video_prompt_type, all_guide_processes) video_prompt_type = 
add_to_sequence(video_prompt_type, video_prompt_type_video_guide) visible = "V" in video_prompt_type model_type = state["model_type"] @@ -6978,8 +6979,12 @@ def refresh_video_prompt_type_video_guide(state, video_prompt_type, video_prompt image_outputs = image_mode > 0 keep_frames_video_guide_visible = not image_outputs and visible and not model_def.get("keep_frames_video_guide_not_supported", False) image_mask_guide, image_guide, image_mask = switch_image_guide_editor(image_mode, old_video_prompt_type , video_prompt_type, old_image_mask_guide_value, old_image_guide_value, old_image_mask_value ) - - return video_prompt_type, gr.update(visible = visible and not image_outputs), image_guide, gr.update(visible = keep_frames_video_guide_visible), gr.update(visible = visible and "G" in video_prompt_type), gr.update(visible= (visible or "F" in video_prompt_type or "K" in video_prompt_type) and any_outpainting), gr.update(visible= visible and not "U" in video_prompt_type ), gr.update(visible= mask_visible and not image_outputs), image_mask, image_mask_guide, gr.update(visible= mask_visible) + mask_preprocessing = model_def.get("mask_preprocessing", None) + if mask_preprocessing is not None: + mask_selector_visible = mask_preprocessing.get("visible", True) + else: + mask_selector_visible = True + return video_prompt_type, gr.update(visible = visible and not image_outputs), image_guide, gr.update(visible = keep_frames_video_guide_visible), gr.update(visible = visible and "G" in video_prompt_type), gr.update(visible= (visible or "F" in video_prompt_type or "K" in video_prompt_type) and any_outpainting), gr.update(visible= visible and mask_selector_visible and not "U" in video_prompt_type ) , gr.update(visible= mask_visible and not image_outputs), image_mask, image_mask_guide, gr.update(visible= mask_visible) def refresh_video_prompt_type_video_guide_alt(state, video_prompt_type, video_prompt_type_video_guide_alt, image_mode): @@ -7391,7 +7396,7 @@ def 
generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non any_start_image = any_end_image = any_reference_image = any_image_mask = False v2i_switch_supported = (vace or t2v or standin) and not image_outputs ti2v_2_2 = base_model_type in ["ti2v_2_2"] - gallery_height = 350 + gallery_height = 550 def get_image_gallery(label ="", value = None, single_image_mode = False, visible = False ): with gr.Row(visible = visible) as gallery_row: gallery_amg = AdvancedMediaGallery(media_mode="image", height=gallery_height, columns=4, label=label, initial = value , single_image_mode = single_image_mode ) @@ -7459,8 +7464,12 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non keep_frames_video_source = gr.Text(value=ui_defaults.get("keep_frames_video_source","") , visible= len(filter_letters(image_prompt_type_value, "VL"))>0 , scale = 2, label= "Truncate Video beyond this number of resampled Frames (empty=Keep All, negative truncates from End)" ) any_control_video = any_control_image = False - guide_preprocessing = model_def.get("guide_preprocessing", None) - mask_preprocessing = model_def.get("mask_preprocessing", None) + if image_mode_value ==2: + guide_preprocessing = { "selection": ["V", "VG"]} + mask_preprocessing = { "selection": ["A"]} + else: + guide_preprocessing = model_def.get("guide_preprocessing", None) + mask_preprocessing = model_def.get("mask_preprocessing", None) guide_custom_choices = model_def.get("guide_custom_choices", None) image_ref_choices = model_def.get("image_ref_choices", None) @@ -7504,7 +7513,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non if image_outputs: video_prompt_type_video_guide_label = video_prompt_type_video_guide_label.replace("Video", "Image") video_prompt_type_video_guide = gr.Dropdown( guide_preprocessing_choices, - value=filter_letters(video_prompt_type_value, "PDESLCMUVB", guide_preprocessing.get("default", "") ), + 
value=filter_letters(video_prompt_type_value, all_guide_processes, guide_preprocessing.get("default", "") ), label= video_prompt_type_video_guide_label , scale = 2, visible= guide_preprocessing.get("visible", True) , show_label= True, ) any_control_video = True @@ -7590,8 +7599,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non if image_guide_value is None: image_mask_guide_value = None else: - image_mask_value = rgb_bw_to_rgba_mask(image_mask_value) - image_mask_guide_value = { "background" : image_guide_value, "composite" : None, "layers": [image_mask_value] } + image_mask_guide_value = { "background" : image_guide_value, "composite" : None} + image_mask_guide_value["layers"] = [] if image_mask_value is None else [rgb_bw_to_rgba_mask(image_mask_value)] image_mask_guide = gr.ImageEditor( label="Control Image to be Inpainted" if image_mode_value == 2 else "Control Image and Mask",