mirror of
				https://github.com/Wan-Video/Wan2.1.git
				synced 2025-11-04 06:15:17 +00:00 
			
		
		
		
	add qwen edit plus support
This commit is contained in:
		
							parent
							
								
									2cbcb9523e
								
							
						
					
					
						commit
						76f86c43ad
					
				@ -20,7 +20,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 | 
			
		||||
**Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
 | 
			
		||||
 | 
			
		||||
## 🔥 Latest Updates : 
 | 
			
		||||
### September 23 2025: WanGP v8.7 - Here Are Two New Contenders in the Vace Arena !
 | 
			
		||||
### September 24 2025: WanGP v8.72 - Here Are ~~Two~~Three New Contenders in the Vace Arena !
 | 
			
		||||
 | 
			
		||||
So in today's release you will find two Wannabe Vace models that each cover only a subset of Vace features but offer some interesting advantages:
 | 
			
		||||
- **Wan 2.2 Animate**: this model is specialized in *Body Motion* and *Facial Motion transfers*. It does that very well. You can either *Replace* a person in a Video or *Animate* the person of your choice using an existing *Pose Video* (remember *Animate Anyone* ?). By default it will keep the original soundtrack. *Wan 2.2 Animate* seems to be under the hood a derived i2v model and should support the corresponding Loras Accelerators (for instance *FusioniX i2v*). Also as a WanGP exclusivity, you will find support for *Outpainting*.
 | 
			
		||||
@ -29,7 +29,11 @@ In order to use Wan 2.2 Animate you will need first to stop by the *Mat Anyone*
 | 
			
		||||
 | 
			
		||||
- **Lucy Edit**: this one claims to be a *Nano Banana* for Videos. Give it a video and ask it to change it (it is specialized in clothes changing) and voila ! The nice thing about it is that it is based on the *Wan 2.2 5B* model and therefore is very fast, especially if you use the *FastWan* finetune that is also part of the package.
 | 
			
		||||
 | 
			
		||||
Also because I wanted to spoil you:
 | 
			
		||||
- **Qwen Edit Plus**: also known as the *Qwen Edit 25th September Update*, which is specialized in combining multiple Objects / People. There is also new support for *Pose transfer* & *Recolorisation*. All of this is made easy to use in WanGP. For now you will find only the quantized version, since HF crashes when uploading the unquantized version.
 | 
			
		||||
 | 
			
		||||
*Update 8.71*: fixed Fast Lucy Edit that didn't contain the lora
 | 
			
		||||
*Update 8.72*: shadow drop of Qwen Edit Plus
 | 
			
		||||
 | 
			
		||||
### September 15 2025: WanGP v8.6 - Attack of the Clones
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										17
									
								
								defaults/qwen_image_edit_plus_20B.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								defaults/qwen_image_edit_plus_20B.json
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,17 @@
 | 
			
		||||
{
 | 
			
		||||
    "model": {
 | 
			
		||||
        "name": "Qwen Image Edit Plus 20B",
 | 
			
		||||
        "architecture": "qwen_image_edit_plus_20B",
 | 
			
		||||
        "description": "Qwen Image Edit Plus is a generative model that can generate very high quality images with long texts in it. Best results will be at 720p. This model is optimized to combine multiple Subjects & Objects.",
 | 
			
		||||
        "URLs": [
 | 
			
		||||
            "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_plus_20B_quanto_bf16_int8.safetensors"
 | 
			
		||||
        ],
 | 
			
		||||
        "preload_URLs": "qwen_image_edit_20B",
 | 
			
		||||
        "attention": {
 | 
			
		||||
            "<89": "sdpa"
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    "prompt": "add a hat",
 | 
			
		||||
    "resolution": "1024x1024",
 | 
			
		||||
    "batch_size": 1
 | 
			
		||||
}
 | 
			
		||||
@ -28,7 +28,7 @@ class family_handler():
 | 
			
		||||
            extra_model_def["any_image_refs_relative_size"] = True
 | 
			
		||||
            extra_model_def["no_background_removal"] = True
 | 
			
		||||
            extra_model_def["image_ref_choices"] = {
 | 
			
		||||
                "choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "KI"),
 | 
			
		||||
                "choices":[("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "KI"),
 | 
			
		||||
                            ("Up to two Images are Style Images", "KIJ")],
 | 
			
		||||
                "default": "KI",
 | 
			
		||||
                "letters_filter": "KIJ",
 | 
			
		||||
 | 
			
		||||
@ -200,7 +200,8 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
 | 
			
		||||
        self.tokenizer_max_length = 1024
 | 
			
		||||
        if processor is not None:
 | 
			
		||||
            self.prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
 | 
			
		||||
            # self.prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
 | 
			
		||||
            self.prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
 | 
			
		||||
            self.prompt_template_encode_start_idx = 64
 | 
			
		||||
        else:
 | 
			
		||||
            self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
 | 
			
		||||
@ -232,6 +233,21 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
        txt = [template.format(e) for e in prompt]
 | 
			
		||||
 | 
			
		||||
        if self.processor is not None and image is not None:
 | 
			
		||||
            img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"
 | 
			
		||||
            if isinstance(image, list):
 | 
			
		||||
                base_img_prompt = ""
 | 
			
		||||
                for i, img in enumerate(image):
 | 
			
		||||
                    base_img_prompt += img_prompt_template.format(i + 1)
 | 
			
		||||
            elif image is not None:
 | 
			
		||||
                base_img_prompt = img_prompt_template.format(1)
 | 
			
		||||
            else:
 | 
			
		||||
                base_img_prompt = ""
 | 
			
		||||
 | 
			
		||||
            template = self.prompt_template_encode
 | 
			
		||||
 | 
			
		||||
            drop_idx = self.prompt_template_encode_start_idx
 | 
			
		||||
            txt = [template.format(base_img_prompt + e) for e in prompt]
 | 
			
		||||
 | 
			
		||||
            model_inputs = self.processor(
 | 
			
		||||
                text=txt,
 | 
			
		||||
                images=image,
 | 
			
		||||
@ -464,7 +480,7 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
 | 
			
		||||
    def prepare_latents(
 | 
			
		||||
        self,
 | 
			
		||||
        image,
 | 
			
		||||
        images,
 | 
			
		||||
        batch_size,
 | 
			
		||||
        num_channels_latents,
 | 
			
		||||
        height,
 | 
			
		||||
@ -482,7 +498,11 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
        shape = (batch_size, num_channels_latents, 1, height, width)
 | 
			
		||||
 | 
			
		||||
        image_latents = None
 | 
			
		||||
        if image is not None:
 | 
			
		||||
        if images is not None and len(images ) > 0:
 | 
			
		||||
            if not isinstance(images, list):
 | 
			
		||||
                images = [images]
 | 
			
		||||
            all_image_latents = []
 | 
			
		||||
            for image in images:
 | 
			
		||||
                image = image.to(device=device, dtype=dtype)
 | 
			
		||||
                if image.shape[1] != self.latent_channels:
 | 
			
		||||
                    image_latents = self._encode_vae_image(image=image, generator=generator)
 | 
			
		||||
@ -500,6 +520,8 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
                    image_latents = torch.cat([image_latents], dim=0)
 | 
			
		||||
 | 
			
		||||
                image_latents = self._pack_latents(image_latents)
 | 
			
		||||
                all_image_latents.append(image_latents)
 | 
			
		||||
            image_latents = torch.cat(all_image_latents, dim=1)
 | 
			
		||||
 | 
			
		||||
        if isinstance(generator, list) and len(generator) != batch_size:
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
@ -568,6 +590,7 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
        joint_pass= True,
 | 
			
		||||
        lora_inpaint = False,
 | 
			
		||||
        outpainting_dims = None,
 | 
			
		||||
        qwen_edit_plus = False,
 | 
			
		||||
    ):
 | 
			
		||||
        r"""
 | 
			
		||||
        Function invoked when calling the pipeline for generation.
 | 
			
		||||
@ -683,61 +706,54 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
            batch_size = prompt_embeds.shape[0]
 | 
			
		||||
        device = "cuda"
 | 
			
		||||
 | 
			
		||||
        prompt_image = None
 | 
			
		||||
        condition_images = []
 | 
			
		||||
        vae_image_sizes = []
 | 
			
		||||
        vae_images = []
 | 
			
		||||
        image_mask_latents = None
 | 
			
		||||
        if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
 | 
			
		||||
            image = image[0] if isinstance(image, list) else image
 | 
			
		||||
            image_height, image_width = self.image_processor.get_default_height_width(image)
 | 
			
		||||
            aspect_ratio = image_width / image_height
 | 
			
		||||
            if False :
 | 
			
		||||
                _, image_width, image_height = min(
 | 
			
		||||
                    (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS
 | 
			
		||||
                )
 | 
			
		||||
            image_width = image_width // multiple_of * multiple_of
 | 
			
		||||
            image_height = image_height // multiple_of * multiple_of
 | 
			
		||||
            ref_height, ref_width = 1568, 672
 | 
			
		||||
 | 
			
		||||
            if image_mask is None:
 | 
			
		||||
                if height * width < ref_height * ref_width: ref_height , ref_width = height , width  
 | 
			
		||||
                if image_height * image_width > ref_height * ref_width:
 | 
			
		||||
                    image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of)
 | 
			
		||||
                if (image_width,image_height) != image.size:
 | 
			
		||||
                    image = image.resize((image_width,image_height), resample=Image.Resampling.LANCZOS) 
 | 
			
		||||
            elif not lora_inpaint:
 | 
			
		||||
                # _, image_width, image_height = min(
 | 
			
		||||
                #     (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS
 | 
			
		||||
                # )
 | 
			
		||||
                image_height, image_width = calculate_new_dimensions(height, width, image_height, image_width, False, block_size=multiple_of)
 | 
			
		||||
                # image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of)
 | 
			
		||||
                height, width = image_height, image_width
 | 
			
		||||
                image_mask_latents = convert_image_to_tensor(image_mask.resize((width // 8, height // 8), resample=Image.Resampling.LANCZOS))
 | 
			
		||||
        ref_size = 1024
 | 
			
		||||
        ref_text_encoder_size = 384 if qwen_edit_plus else 1024
 | 
			
		||||
        if image is not None:
 | 
			
		||||
            if not isinstance(image, list): image = [image]
 | 
			
		||||
            if height * width < ref_size * ref_size: ref_size =  round(math.sqrt(height * width))  
 | 
			
		||||
            for ref_no, img in enumerate(image):
 | 
			
		||||
                image_width, image_height = img.size
 | 
			
		||||
                any_mask = ref_no == 0 and image_mask is not None
 | 
			
		||||
                if (image_height * image_width > ref_size * ref_size) and not any_mask:
 | 
			
		||||
                    vae_height, vae_width =calculate_new_dimensions(ref_size, ref_size, image_height, image_width, False, block_size=multiple_of)
 | 
			
		||||
                else:
 | 
			
		||||
                    vae_height, vae_width = image_height, image_width 
 | 
			
		||||
                    vae_width = vae_width // multiple_of * multiple_of
 | 
			
		||||
                    vae_height = vae_height // multiple_of * multiple_of
 | 
			
		||||
                vae_image_sizes.append((vae_width, vae_height))
 | 
			
		||||
                condition_height, condition_width =calculate_new_dimensions(ref_text_encoder_size, ref_text_encoder_size, image_height, image_width, False, block_size=multiple_of)
 | 
			
		||||
                condition_images.append(img.resize((condition_width, condition_height), resample=Image.Resampling.LANCZOS) )
 | 
			
		||||
                if img.size != (vae_width, vae_height):
 | 
			
		||||
                    img = img.resize((vae_width, vae_height), resample=Image.Resampling.LANCZOS) 
 | 
			
		||||
                if any_mask :
 | 
			
		||||
                    if lora_inpaint:
 | 
			
		||||
                        image_mask_rebuilt = torch.where(convert_image_to_tensor(image_mask)>-0.5, 1., 0. )[0:1]
 | 
			
		||||
                        img = convert_image_to_tensor(img)
 | 
			
		||||
                        green = torch.tensor([-1.0, 1.0, -1.0]).to(img) 
 | 
			
		||||
                        green_image = green[:, None, None] .expand_as(img)
 | 
			
		||||
                        img = torch.where(image_mask_rebuilt > 0, green_image, img)
 | 
			
		||||
                        img = convert_tensor_to_image(img)
 | 
			
		||||
                    else:
 | 
			
		||||
                        image_mask_latents = convert_image_to_tensor(image_mask.resize((vae_width // 8, vae_height // 8), resample=Image.Resampling.LANCZOS))
 | 
			
		||||
                        image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1]
 | 
			
		||||
                        image_mask_rebuilt = image_mask_latents.repeat_interleave(8, dim=-1).repeat_interleave(8, dim=-2).unsqueeze(0)
 | 
			
		||||
                        # convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png")
 | 
			
		||||
                        image_mask_latents = image_mask_latents.to(device).unsqueeze(0).unsqueeze(0).repeat(1,16,1,1,1)
 | 
			
		||||
                        image_mask_latents = self._pack_latents(image_mask_latents)
 | 
			
		||||
                # img.save("nnn.png")
 | 
			
		||||
                vae_images.append( convert_image_to_tensor(img).unsqueeze(0).unsqueeze(2) )
 | 
			
		||||
 | 
			
		||||
            prompt_image = image
 | 
			
		||||
            if image.size != (image_width, image_height):
 | 
			
		||||
                image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
 | 
			
		||||
 | 
			
		||||
            image = convert_image_to_tensor(image)
 | 
			
		||||
            if lora_inpaint:
 | 
			
		||||
                image_mask_rebuilt = torch.where(convert_image_to_tensor(image_mask)>-0.5, 1., 0. )[0:1]
 | 
			
		||||
                image_mask_latents = None
 | 
			
		||||
                green = torch.tensor([-1.0, 1.0, -1.0]).to(image) 
 | 
			
		||||
                green_image = green[:, None, None] .expand_as(image)
 | 
			
		||||
                image = torch.where(image_mask_rebuilt > 0, green_image, image)
 | 
			
		||||
                prompt_image = convert_tensor_to_image(image)
 | 
			
		||||
            image = image.unsqueeze(0).unsqueeze(2)
 | 
			
		||||
            # image.save("nnn.png")
 | 
			
		||||
 | 
			
		||||
        has_neg_prompt = negative_prompt is not None or (
 | 
			
		||||
            negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
 | 
			
		||||
        )
 | 
			
		||||
        do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
 | 
			
		||||
        prompt_embeds, prompt_embeds_mask = self.encode_prompt(
 | 
			
		||||
            image=prompt_image,
 | 
			
		||||
            image=condition_images,
 | 
			
		||||
            prompt=prompt,
 | 
			
		||||
            prompt_embeds=prompt_embeds,
 | 
			
		||||
            prompt_embeds_mask=prompt_embeds_mask,
 | 
			
		||||
@ -747,7 +763,7 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
        )
 | 
			
		||||
        if do_true_cfg:
 | 
			
		||||
            negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(
 | 
			
		||||
                image=prompt_image,
 | 
			
		||||
                image=condition_images,
 | 
			
		||||
                prompt=negative_prompt,
 | 
			
		||||
                prompt_embeds=negative_prompt_embeds,
 | 
			
		||||
                prompt_embeds_mask=negative_prompt_embeds_mask,
 | 
			
		||||
@ -763,7 +779,7 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
        # 4. Prepare latent variables
 | 
			
		||||
        num_channels_latents = self.transformer.in_channels // 4
 | 
			
		||||
        latents, image_latents = self.prepare_latents(
 | 
			
		||||
            image,
 | 
			
		||||
            vae_images,
 | 
			
		||||
            batch_size * num_images_per_prompt,
 | 
			
		||||
            num_channels_latents,
 | 
			
		||||
            height,
 | 
			
		||||
@ -779,7 +795,12 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
            img_shapes = [
 | 
			
		||||
                [
 | 
			
		||||
                    (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2),
 | 
			
		||||
                    (1, image_height // self.vae_scale_factor // 2, image_width // self.vae_scale_factor // 2),
 | 
			
		||||
                    # (1, image_height // self.vae_scale_factor // 2, image_width // self.vae_scale_factor // 2),
 | 
			
		||||
                    *[
 | 
			
		||||
                        (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2)
 | 
			
		||||
                        for vae_width, vae_height in vae_image_sizes
 | 
			
		||||
                    ],
 | 
			
		||||
 | 
			
		||||
                ]
 | 
			
		||||
            ] * batch_size
 | 
			
		||||
        else:
 | 
			
		||||
@ -971,7 +992,7 @@ class QwenImagePipeline(): #DiffusionPipeline
 | 
			
		||||
            latents = latents / latents_std + latents_mean
 | 
			
		||||
            output_image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
 | 
			
		||||
            if image_mask is not None and not lora_inpaint :  #not (lora_inpaint and outpainting_dims is not None):
 | 
			
		||||
                output_image = image.squeeze(2) * (1 - image_mask_rebuilt) + output_image.to(image) * image_mask_rebuilt 
 | 
			
		||||
                output_image = vae_images[0].squeeze(2) * (1 - image_mask_rebuilt) + output_image.to(vae_images[0]  ) * image_mask_rebuilt 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        return output_image
 | 
			
		||||
 | 
			
		||||
@ -20,7 +20,7 @@ class family_handler():
 | 
			
		||||
            "fit_into_canvas_image_refs": 0,
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if base_model_type in ["qwen_image_edit_20B"]: 
 | 
			
		||||
        if base_model_type in ["qwen_image_edit_20B", "qwen_image_edit_plus_20B"]: 
 | 
			
		||||
            extra_model_def["inpaint_support"] = True
 | 
			
		||||
            extra_model_def["image_ref_choices"] = {
 | 
			
		||||
            "choices": [
 | 
			
		||||
@ -42,11 +42,20 @@ class family_handler():
 | 
			
		||||
                        "image_modes" : [2],
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
        if base_model_type in ["qwen_image_edit_plus_20B"]: 
 | 
			
		||||
            extra_model_def["guide_preprocessing"] = {
 | 
			
		||||
                    "selection": ["", "PV", "SV", "CV"],
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
            extra_model_def["mask_preprocessing"] = {
 | 
			
		||||
                    "selection": ["", "A"],
 | 
			
		||||
                    "visible": False,
 | 
			
		||||
                }
 | 
			
		||||
        return extra_model_def
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def query_supported_types():
 | 
			
		||||
        return ["qwen_image_20B", "qwen_image_edit_20B"]
 | 
			
		||||
        return ["qwen_image_20B", "qwen_image_edit_20B", "qwen_image_edit_plus_20B"]
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def query_family_maps():
 | 
			
		||||
@ -113,9 +122,15 @@ class family_handler():
 | 
			
		||||
                "denoising_strength" : 1.,
 | 
			
		||||
                "model_mode" : 0,
 | 
			
		||||
            })
 | 
			
		||||
        elif base_model_type in ["qwen_image_edit_plus_20B"]: 
 | 
			
		||||
            ui_defaults.update({
 | 
			
		||||
                "video_prompt_type": "I",
 | 
			
		||||
                "denoising_strength" : 1.,
 | 
			
		||||
                "model_mode" : 0,
 | 
			
		||||
            })
 | 
			
		||||
 | 
			
		||||
    def validate_generative_settings(base_model_type, model_def, inputs):
 | 
			
		||||
        if base_model_type in ["qwen_image_edit_20B"]:
 | 
			
		||||
        if base_model_type in ["qwen_image_edit_20B", "qwen_image_edit_plus_20B"]:
 | 
			
		||||
            model_mode = inputs["model_mode"]
 | 
			
		||||
            denoising_strength= inputs["denoising_strength"]
 | 
			
		||||
            video_guide_outpainting= inputs["video_guide_outpainting"]
 | 
			
		||||
 | 
			
		||||
@ -51,10 +51,10 @@ class model_factory():
 | 
			
		||||
        transformer_filename = model_filename[0]
 | 
			
		||||
        processor = None
 | 
			
		||||
        tokenizer = None
 | 
			
		||||
        if base_model_type == "qwen_image_edit_20B":
 | 
			
		||||
        if base_model_type in ["qwen_image_edit_20B", "qwen_image_edit_plus_20B"]:
 | 
			
		||||
            processor = Qwen2VLProcessor.from_pretrained(os.path.join(checkpoint_dir,"Qwen2.5-VL-7B-Instruct"))
 | 
			
		||||
        tokenizer = AutoTokenizer.from_pretrained(os.path.join(checkpoint_dir,"Qwen2.5-VL-7B-Instruct"))
 | 
			
		||||
 | 
			
		||||
        self.base_model_type = base_model_type
 | 
			
		||||
 | 
			
		||||
        base_config_file = "configs/qwen_image_20B.json" 
 | 
			
		||||
        with open(base_config_file, 'r', encoding='utf-8') as f:
 | 
			
		||||
@ -173,7 +173,7 @@ class model_factory():
 | 
			
		||||
            self.vae.tile_latent_min_height  = VAE_tile_size[1] 
 | 
			
		||||
            self.vae.tile_latent_min_width  = VAE_tile_size[1]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        qwen_edit_plus = self.base_model_type in ["qwen_image_edit_plus_20B"]
 | 
			
		||||
        self.vae.enable_slicing()
 | 
			
		||||
        # width, height = aspect_ratios["16:9"]
 | 
			
		||||
 | 
			
		||||
@ -182,14 +182,16 @@ class model_factory():
 | 
			
		||||
 | 
			
		||||
        image_mask = None if input_masks is None else convert_tensor_to_image(input_masks, mask_levels= True) 
 | 
			
		||||
        if input_frames is not None:
 | 
			
		||||
            input_ref_images = [convert_tensor_to_image(input_frames) ] 
 | 
			
		||||
        elif input_ref_images is not None:
 | 
			
		||||
            input_ref_images = [convert_tensor_to_image(input_frames) ] +  ([] if input_ref_images  is None else input_ref_images )
 | 
			
		||||
 | 
			
		||||
        if input_ref_images is not None:
 | 
			
		||||
            # image stiching method
 | 
			
		||||
            stiched = input_ref_images[0]
 | 
			
		||||
            if "K" in video_prompt_type :
 | 
			
		||||
                w, h = input_ref_images[0].size
 | 
			
		||||
                height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
 | 
			
		||||
 | 
			
		||||
            if not qwen_edit_plus:
 | 
			
		||||
                for new_img in input_ref_images[1:]:
 | 
			
		||||
                    stiched = stitch_images(stiched, new_img)
 | 
			
		||||
                input_ref_images  = [stiched]
 | 
			
		||||
@ -212,6 +214,7 @@ class model_factory():
 | 
			
		||||
            generator=torch.Generator(device="cuda").manual_seed(seed),
 | 
			
		||||
            lora_inpaint = image_mask is not None and model_mode == 1,
 | 
			
		||||
            outpainting_dims = outpainting_dims,
 | 
			
		||||
            qwen_edit_plus = qwen_edit_plus,
 | 
			
		||||
        )      
 | 
			
		||||
        if image is None: return None
 | 
			
		||||
        return image.transpose(0, 1)
 | 
			
		||||
 | 
			
		||||
@ -21,6 +21,7 @@ from .utils.get_default_model import get_matanyone_model
 | 
			
		||||
from .matanyone.inference.inference_core import InferenceCore
 | 
			
		||||
from .matanyone_wrapper import matanyone
 | 
			
		||||
from shared.utils.audio_video import save_video, save_image
 | 
			
		||||
from mmgp import offload
 | 
			
		||||
 | 
			
		||||
arg_device = "cuda"
 | 
			
		||||
arg_sam_model_type="vit_h"
 | 
			
		||||
@ -539,7 +540,7 @@ def video_matting(video_state,video_input, end_slider, matting_type, interactive
 | 
			
		||||
    file_name = ".".join(file_name.split(".")[:-1]) 
 | 
			
		||||
 
 | 
			
		||||
    from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files    
 | 
			
		||||
    source_audio_tracks, audio_metadata  = extract_audio_tracks(video_input)
 | 
			
		||||
    source_audio_tracks, audio_metadata  = extract_audio_tracks(video_input, verbose= offload.default_verboseLevel )
 | 
			
		||||
    output_fg_path =  f"./mask_outputs/{file_name}_fg.mp4"
 | 
			
		||||
    output_fg_temp_path =  f"./mask_outputs/{file_name}_fg_tmp.mp4"
 | 
			
		||||
    if len(source_audio_tracks) == 0:
 | 
			
		||||
@ -679,7 +680,6 @@ def load_unload_models(selected):
 | 
			
		||||
            }
 | 
			
		||||
            # os.path.join('.')
 | 
			
		||||
 | 
			
		||||
            from mmgp import offload
 | 
			
		||||
 | 
			
		||||
            # sam_checkpoint = load_file_from_url(sam_checkpoint_url_dict[arg_sam_model_type], ".")
 | 
			
		||||
            sam_checkpoint = None
 | 
			
		||||
 | 
			
		||||
@ -321,7 +321,7 @@ def fit_image_into_canvas(ref_img, image_size, canvas_tf_bg =127.5, device ="cpu
 | 
			
		||||
    ref_width, ref_height = ref_img.size
 | 
			
		||||
    if (ref_height, ref_width) == image_size and outpainting_dims  == None:
 | 
			
		||||
        ref_img = TF.to_tensor(ref_img).sub_(0.5).div_(0.5).unsqueeze(1)
 | 
			
		||||
        canvas = torch.zeros_like(ref_img) if return_mask else None
 | 
			
		||||
        canvas = torch.zeros_like(ref_img[:1]) if return_mask else None
 | 
			
		||||
    else:
 | 
			
		||||
        if outpainting_dims != None:
 | 
			
		||||
            final_height, final_width = image_size
 | 
			
		||||
@ -374,7 +374,7 @@ def prepare_video_guide_and_mask( video_guides, video_masks, pre_video_guide, im
 | 
			
		||||
        if pre_video_guide is not None:
 | 
			
		||||
            src_video = pre_video_guide if src_video is None else torch.cat( [pre_video_guide, src_video], dim=1)
 | 
			
		||||
            if any_mask:
 | 
			
		||||
                src_mask = torch.zeros_like(pre_video_guide[0:1]) if src_mask is None else torch.cat( [torch.zeros_like(pre_video_guide[0:1]), src_mask], dim=1)
 | 
			
		||||
                src_mask = torch.zeros_like(pre_video_guide[:1]) if src_mask is None else torch.cat( [torch.zeros_like(pre_video_guide[:1]), src_mask], dim=1)
 | 
			
		||||
 | 
			
		||||
        if any_guide_padding:
 | 
			
		||||
            if src_video is None:
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										31
									
								
								wgp.py
									
									
									
									
									
								
							
							
						
						
									
										31
									
								
								wgp.py
									
									
									
									
									
								
							@ -63,7 +63,7 @@ AUTOSAVE_FILENAME = "queue.zip"
 | 
			
		||||
PROMPT_VARS_MAX = 10
 | 
			
		||||
 | 
			
		||||
target_mmgp_version = "3.6.0"
 | 
			
		||||
WanGP_version = "8.71"
 | 
			
		||||
WanGP_version = "8.72"
 | 
			
		||||
settings_version = 2.35
 | 
			
		||||
max_source_video_frames = 3000
 | 
			
		||||
prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
 | 
			
		||||
@ -4987,7 +4987,7 @@ def generate_video(
 | 
			
		||||
                        frames_to_inject[pos] = image_refs[i] 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
            video_guide_processed = video_mask_processed = video_guide_processed2 = video_mask_processed2 = None
 | 
			
		||||
            video_guide_processed = video_mask_processed = video_guide_processed2 = video_mask_processed2 = sparse_video_image = None
 | 
			
		||||
            if video_guide is not None:
 | 
			
		||||
                keep_frames_parsed_full, error = parse_keep_frames_video_guide(keep_frames_video_guide, source_video_frames_count -source_video_overlap_frames_count + requested_frames_to_generate)
 | 
			
		||||
                if len(error) > 0:
 | 
			
		||||
@ -6566,11 +6566,11 @@ def switch_image_mode(state):
 | 
			
		||||
    inpaint_support = model_def.get("inpaint_support", False)
 | 
			
		||||
    if inpaint_support:
 | 
			
		||||
        if image_mode == 1:
 | 
			
		||||
            video_prompt_type = del_in_sequence(video_prompt_type, "VAG")  
 | 
			
		||||
            video_prompt_type = del_in_sequence(video_prompt_type, "VAG" + all_guide_processes)  
 | 
			
		||||
            video_prompt_type = add_to_sequence(video_prompt_type, "KI")
 | 
			
		||||
        elif image_mode == 2:
 | 
			
		||||
            video_prompt_type = del_in_sequence(video_prompt_type, "KI" + all_guide_processes)
 | 
			
		||||
            video_prompt_type = add_to_sequence(video_prompt_type, "VAG")  
 | 
			
		||||
            video_prompt_type = del_in_sequence(video_prompt_type, "KI")
 | 
			
		||||
        ui_defaults["video_prompt_type"] = video_prompt_type 
 | 
			
		||||
        
 | 
			
		||||
    return  str(time.time())
 | 
			
		||||
@ -6965,10 +6965,11 @@ def refresh_video_prompt_type_alignment(state, video_prompt_type, video_prompt_t
 | 
			
		||||
    video_prompt_type = add_to_sequence(video_prompt_type, video_prompt_type_video_guide)
 | 
			
		||||
    return video_prompt_type
 | 
			
		||||
 | 
			
		||||
all_guide_processes ="PDESLCMUVB"
 | 
			
		||||
 | 
			
		||||
def refresh_video_prompt_type_video_guide(state, video_prompt_type, video_prompt_type_video_guide,  image_mode, old_image_mask_guide_value, old_image_guide_value, old_image_mask_value ):
 | 
			
		||||
    old_video_prompt_type = video_prompt_type
 | 
			
		||||
    video_prompt_type = del_in_sequence(video_prompt_type, "PDESLCMUVB")
 | 
			
		||||
    video_prompt_type = del_in_sequence(video_prompt_type, all_guide_processes)
 | 
			
		||||
    video_prompt_type = add_to_sequence(video_prompt_type, video_prompt_type_video_guide)
 | 
			
		||||
    visible = "V" in video_prompt_type
 | 
			
		||||
    model_type = state["model_type"]
 | 
			
		||||
@ -6978,8 +6979,12 @@ def refresh_video_prompt_type_video_guide(state, video_prompt_type, video_prompt
 | 
			
		||||
    image_outputs =  image_mode > 0
 | 
			
		||||
    keep_frames_video_guide_visible = not image_outputs and visible and not model_def.get("keep_frames_video_guide_not_supported", False)
 | 
			
		||||
    image_mask_guide, image_guide, image_mask = switch_image_guide_editor(image_mode, old_video_prompt_type , video_prompt_type, old_image_mask_guide_value, old_image_guide_value, old_image_mask_value )
 | 
			
		||||
 | 
			
		||||
    return video_prompt_type,  gr.update(visible = visible and not image_outputs), image_guide, gr.update(visible = keep_frames_video_guide_visible), gr.update(visible = visible and "G" in video_prompt_type), gr.update(visible= (visible or "F" in video_prompt_type or "K" in video_prompt_type) and any_outpainting), gr.update(visible= visible and not "U" in video_prompt_type ),  gr.update(visible= mask_visible and not image_outputs), image_mask, image_mask_guide, gr.update(visible= mask_visible) 
 | 
			
		||||
    mask_preprocessing = model_def.get("mask_preprocessing", None)
 | 
			
		||||
    if mask_preprocessing  is not None:
 | 
			
		||||
        mask_selector_visible = mask_preprocessing.get("visible", True)
 | 
			
		||||
    else:
 | 
			
		||||
        mask_selector_visible = True
 | 
			
		||||
    return video_prompt_type,  gr.update(visible = visible and not image_outputs), image_guide, gr.update(visible = keep_frames_video_guide_visible), gr.update(visible = visible and "G" in video_prompt_type), gr.update(visible= (visible or "F" in video_prompt_type or "K" in video_prompt_type) and any_outpainting), gr.update(visible= visible and mask_selector_visible and  not "U" in video_prompt_type ) ,  gr.update(visible= mask_visible and not image_outputs), image_mask, image_mask_guide, gr.update(visible= mask_visible) 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def refresh_video_prompt_type_video_guide_alt(state, video_prompt_type, video_prompt_type_video_guide_alt, image_mode):
 | 
			
		||||
@ -7391,7 +7396,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
 | 
			
		||||
            any_start_image = any_end_image = any_reference_image = any_image_mask = False
 | 
			
		||||
            v2i_switch_supported = (vace or t2v or standin) and not image_outputs
 | 
			
		||||
            ti2v_2_2 = base_model_type in ["ti2v_2_2"]
 | 
			
		||||
            gallery_height = 350
 | 
			
		||||
            gallery_height = 550
 | 
			
		||||
            def get_image_gallery(label ="", value = None, single_image_mode = False, visible = False ):
 | 
			
		||||
                with gr.Row(visible = visible) as gallery_row:
 | 
			
		||||
                    gallery_amg = AdvancedMediaGallery(media_mode="image", height=gallery_height, columns=4, label=label, initial = value , single_image_mode = single_image_mode )
 | 
			
		||||
@ -7459,6 +7464,10 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
 | 
			
		||||
                keep_frames_video_source = gr.Text(value=ui_defaults.get("keep_frames_video_source","") , visible= len(filter_letters(image_prompt_type_value, "VL"))>0 , scale = 2, label= "Truncate Video beyond this number of resampled Frames (empty=Keep All, negative truncates from End)" ) 
 | 
			
		||||
 | 
			
		||||
            any_control_video = any_control_image = False
 | 
			
		||||
            if image_mode_value ==2:
 | 
			
		||||
                guide_preprocessing = { "selection": ["V", "VG"]}
 | 
			
		||||
                mask_preprocessing = { "selection": ["A"]}
 | 
			
		||||
            else:
 | 
			
		||||
                guide_preprocessing = model_def.get("guide_preprocessing", None)
 | 
			
		||||
                mask_preprocessing = model_def.get("mask_preprocessing", None)
 | 
			
		||||
            guide_custom_choices = model_def.get("guide_custom_choices", None)
 | 
			
		||||
@ -7504,7 +7513,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
 | 
			
		||||
                        if image_outputs: video_prompt_type_video_guide_label = video_prompt_type_video_guide_label.replace("Video", "Image")
 | 
			
		||||
                        video_prompt_type_video_guide = gr.Dropdown(
 | 
			
		||||
                            guide_preprocessing_choices,
 | 
			
		||||
                            value=filter_letters(video_prompt_type_value, "PDESLCMUVB", guide_preprocessing.get("default", "") ),
 | 
			
		||||
                            value=filter_letters(video_prompt_type_value,  all_guide_processes, guide_preprocessing.get("default", "") ),
 | 
			
		||||
                            label= video_prompt_type_video_guide_label , scale = 2, visible= guide_preprocessing.get("visible", True) , show_label= True,
 | 
			
		||||
                        )
 | 
			
		||||
                        any_control_video = True
 | 
			
		||||
@ -7590,8 +7599,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
 | 
			
		||||
                    if image_guide_value is None:
 | 
			
		||||
                        image_mask_guide_value = None
 | 
			
		||||
                    else:
 | 
			
		||||
                        image_mask_value = rgb_bw_to_rgba_mask(image_mask_value)
 | 
			
		||||
                        image_mask_guide_value = { "background" : image_guide_value, "composite" : None, "layers": [image_mask_value] }
 | 
			
		||||
                        image_mask_guide_value = { "background" : image_guide_value, "composite" : None}
 | 
			
		||||
                        image_mask_guide_value["layers"] = [] if image_mask_value is None else [rgb_bw_to_rgba_mask(image_mask_value)]
 | 
			
		||||
 | 
			
		||||
                    image_mask_guide = gr.ImageEditor(
 | 
			
		||||
                        label="Control Image to be Inpainted" if image_mode_value == 2 else "Control Image and Mask",
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user