From f9f63cbc79c364ac146d833292f0ab89e199228c Mon Sep 17 00:00:00 2001 From: DeepBeepMeep Date: Tue, 9 Sep 2025 21:41:35 +0200 Subject: [PATCH 1/3] intermediate commit --- defaults/flux_dev_kontext.json | 2 - defaults/flux_dev_uso.json | 2 - defaults/qwen_image_edit_20B.json | 4 +- models/flux/flux_handler.py | 37 +- models/flux/flux_main.py | 69 +-- models/hyvideo/hunyuan.py | 5 - models/hyvideo/hunyuan_handler.py | 21 + models/ltx_video/ltxv.py | 3 - models/ltx_video/ltxv_handler.py | 9 + models/qwen/pipeline_qwenimage.py | 65 ++- models/qwen/qwen_handler.py | 20 +- models/qwen/qwen_main.py | 10 +- models/wan/any2video.py | 41 +- models/wan/df_handler.py | 2 +- models/wan/wan_handler.py | 74 ++- preprocessing/matanyone/app.py | 69 ++- requirements.txt | 2 +- shared/gradio/gallery.py | 141 +++-- shared/utils/utils.py | 53 +- wgp.py | 839 +++++++++++++++++------------- 20 files changed, 897 insertions(+), 571 deletions(-) diff --git a/defaults/flux_dev_kontext.json b/defaults/flux_dev_kontext.json index 8945918..20b6bc4 100644 --- a/defaults/flux_dev_kontext.json +++ b/defaults/flux_dev_kontext.json @@ -7,8 +7,6 @@ "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1_kontext_dev_bf16.safetensors", "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1_kontext_dev_quanto_bf16_int8.safetensors" ], - "image_outputs": true, - "reference_image": true, "flux-model": "flux-dev-kontext" }, "prompt": "add a hat", diff --git a/defaults/flux_dev_uso.json b/defaults/flux_dev_uso.json index 8b5dbb6..0cd7b82 100644 --- a/defaults/flux_dev_uso.json +++ b/defaults/flux_dev_uso.json @@ -6,8 +6,6 @@ "modules": [ ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_projector_bf16.safetensors"]], "URLs": "flux", "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_dit_lora_bf16.safetensors"], - "image_outputs": true, - "reference_image": true, "flux-model": "flux-dev-uso" }, "prompt": "the man is wearing a hat", diff --git a/defaults/qwen_image_edit_20B.json b/defaults/qwen_image_edit_20B.json index 2b24c72..79b8b24 100644 --- a/defaults/qwen_image_edit_20B.json +++ b/defaults/qwen_image_edit_20B.json @@ -9,9 +9,7 @@ ], "attention": { "<89": "sdpa" - }, - "reference_image": true, - "image_outputs": true + } }, "prompt": "add a hat", "resolution": "1280x720", diff --git a/models/flux/flux_handler.py b/models/flux/flux_handler.py index b8b7b9b..c468d5a 100644 --- a/models/flux/flux_handler.py +++ b/models/flux/flux_handler.py @@ -13,28 +13,41 @@ class family_handler(): flux_schnell = flux_model == "flux-schnell" flux_chroma = flux_model == "flux-chroma" flux_uso = flux_model == "flux-dev-uso" - model_def_output = { + flux_kontext = flux_model == "flux-dev-kontext" + + extra_model_def = { "image_outputs" : True, "no_negative_prompt" : not flux_chroma, } if flux_chroma: - model_def_output["guidance_max_phases"] = 1 + extra_model_def["guidance_max_phases"] = 1 elif not flux_schnell: - model_def_output["embedded_guidance"] = True + extra_model_def["embedded_guidance"] = True if flux_uso : - model_def_output["any_image_refs_relative_size"] = True - model_def_output["no_background_removal"] = True - - model_def_output["image_ref_choices"] = { + extra_model_def["any_image_refs_relative_size"] = True + extra_model_def["no_background_removal"] = True + extra_model_def["image_ref_choices"] = { "choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "KI"), ("Up to two Images are Style Images", 
"KIJ")], "default": "KI", "letters_filter": "KIJ", "label": "Reference Images / Style Images" } - model_def_output["lock_image_refs_ratios"] = True + + if flux_kontext: + extra_model_def["image_ref_choices"] = { + "choices": [ + ("None", ""), + ("Conditional Images is first Main Subject / Landscape and may be followed by People / Objects", "KI"), + ("Conditional Images are People / Objects", "I"), + ], + "letters_filter": "KI", + } - return model_def_output + + extra_model_def["lock_image_refs_ratios"] = True + + return extra_model_def @staticmethod def query_supported_types(): @@ -122,10 +135,12 @@ class family_handler(): def update_default_settings(base_model_type, model_def, ui_defaults): flux_model = model_def.get("flux-model", "flux-dev") flux_uso = flux_model == "flux-dev-uso" + flux_kontext = flux_model == "flux-dev-kontext" ui_defaults.update({ "embedded_guidance": 2.5, - }) - if model_def.get("reference_image", False): + }) + + if flux_kontext or flux_uso: ui_defaults.update({ "video_prompt_type": "KI", }) diff --git a/models/flux/flux_main.py b/models/flux/flux_main.py index 9bb8e73..4d7c67d 100644 --- a/models/flux/flux_main.py +++ b/models/flux/flux_main.py @@ -24,44 +24,6 @@ from .util import ( from PIL import Image -def resize_and_centercrop_image(image, target_height_ref1, target_width_ref1): - target_height_ref1 = int(target_height_ref1 // 64 * 64) - target_width_ref1 = int(target_width_ref1 // 64 * 64) - h, w = image.shape[-2:] - if h < target_height_ref1 or w < target_width_ref1: - # 计算长宽比 - aspect_ratio = w / h - if h < target_height_ref1: - new_h = target_height_ref1 - new_w = new_h * aspect_ratio - if new_w < target_width_ref1: - new_w = target_width_ref1 - new_h = new_w / aspect_ratio - else: - new_w = target_width_ref1 - new_h = new_w / aspect_ratio - if new_h < target_height_ref1: - new_h = target_height_ref1 - new_w = new_h * aspect_ratio - else: - aspect_ratio = w / h - tgt_aspect_ratio = target_width_ref1 / target_height_ref1 - if aspect_ratio > tgt_aspect_ratio: - new_h = target_height_ref1 - new_w = new_h * aspect_ratio - else: - new_w = target_width_ref1 - new_h = new_w / aspect_ratio - # 使用 TVF.resize 进行图像缩放 - image = TVF.resize(image, (math.ceil(new_h), math.ceil(new_w))) - # 计算中心裁剪的参数 - top = (image.shape[-2] - target_height_ref1) // 2 - left = (image.shape[-1] - target_width_ref1) // 2 - # 使用 TVF.crop 进行中心裁剪 - image = TVF.crop(image, top, left, target_height_ref1, target_width_ref1) - return image - - def stitch_images(img1, img2): # Resize img2 to match img1's height width1, height1 = img1.size @@ -171,8 +133,6 @@ class model_factory: device="cuda" flux_dev_uso = self.name in ['flux-dev-uso'] image_stiching = not self.name in ['flux-dev-uso'] #and False - # image_refs_relative_size = 100 - crop = False input_ref_images = [] if input_ref_images is None else input_ref_images[:] ref_style_imgs = [] if "I" in video_prompt_type and len(input_ref_images) > 0: @@ -186,36 +146,15 @@ class model_factory: if image_stiching: # image stiching method stiched = input_ref_images[0] - if "K" in video_prompt_type : - w, h = input_ref_images[0].size - height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas) - # actual rescale will happen in prepare_kontext for new_img in input_ref_images[1:]: stiched = stitch_images(stiched, new_img) input_ref_images = [stiched] else: - first_ref = 0 - if "K" in video_prompt_type: - # image latents tiling method - w, h = input_ref_images[0].size - if crop : - img = convert_image_to_tensor(input_ref_images[0]) - img = 
resize_and_centercrop_image(img, height, width) - input_ref_images[0] = convert_tensor_to_image(img) - else: - height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas) - input_ref_images[0] = input_ref_images[0].resize((width, height), resample=Image.Resampling.LANCZOS) - first_ref = 1 - - for i in range(first_ref,len(input_ref_images)): + # latents stiching with resize + for i in range(len(input_ref_images)): w, h = input_ref_images[i].size - if crop: - img = convert_image_to_tensor(input_ref_images[i]) - img = resize_and_centercrop_image(img, int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100)) - input_ref_images[i] = convert_tensor_to_image(img) - else: - image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas) - input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas) + input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) else: input_ref_images = None diff --git a/models/hyvideo/hunyuan.py b/models/hyvideo/hunyuan.py index a38a7bd..181a9a7 100644 --- a/models/hyvideo/hunyuan.py +++ b/models/hyvideo/hunyuan.py @@ -861,11 +861,6 @@ class HunyuanVideoSampler(Inference): freqs_cos, freqs_sin = self.get_rotary_pos_embed(target_frame_num, target_height, target_width, enable_RIFLEx) else: if self.avatar: - w, h = input_ref_images.size - target_height, target_width = calculate_new_dimensions(target_height, target_width, h, w, fit_into_canvas) - if target_width != w or target_height != h: - input_ref_images = input_ref_images.resize((target_width,target_height), resample=Image.Resampling.LANCZOS) - concat_dict = {'mode': 'timecat', 'bias': -1} freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(129, target_height, target_width, concat_dict) else: diff --git a/models/hyvideo/hunyuan_handler.py b/models/hyvideo/hunyuan_handler.py index 9cbaea7..487e76d 100644 --- a/models/hyvideo/hunyuan_handler.py +++ b/models/hyvideo/hunyuan_handler.py @@ -51,6 +51,23 @@ class family_handler(): extra_model_def["tea_cache"] = True extra_model_def["mag_cache"] = True + if base_model_type in ["hunyuan_custom_edit"]: + extra_model_def["guide_preprocessing"] = { + "selection": ["MV", "PMV"], + } + + extra_model_def["mask_preprocessing"] = { + "selection": ["A", "NA"], + "default" : "NA" + } + + if base_model_type in ["hunyuan_custom_audio", "hunyuan_custom_edit", "hunyuan_custom"]: + extra_model_def["image_ref_choices"] = { + "choices": [("Reference Image", "I")], + "letters_filter":"I", + "visible": False, + } + if base_model_type in ["hunyuan_avatar"]: extra_model_def["no_background_removal"] = True if base_model_type in ["hunyuan_custom", "hunyuan_custom_edit", "hunyuan_custom_audio", "hunyuan_avatar"]: @@ -141,6 +158,10 @@ class family_handler(): return hunyuan_model, pipe + @staticmethod + def fix_settings(base_model_type, settings_version, model_def, ui_defaults): + pass + @staticmethod def update_default_settings(base_model_type, model_def, ui_defaults): ui_defaults["embedded_guidance_scale"]= 6.0 diff --git a/models/ltx_video/ltxv.py b/models/ltx_video/ltxv.py index e71ac4f..db143fc 100644 --- a/models/ltx_video/ltxv.py +++ b/models/ltx_video/ltxv.py @@ -300,9 +300,6 @@ class LTXV: 
prefix_size, height, width = input_video.shape[-3:] else: if image_start != None: - frame_width, frame_height = image_start.size - if fit_into_canvas != None: - height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas, 32) conditioning_media_paths.append(image_start.unsqueeze(1)) conditioning_start_frames.append(0) conditioning_control_frames.append(False) diff --git a/models/ltx_video/ltxv_handler.py b/models/ltx_video/ltxv_handler.py index c89b69a..e44c983 100644 --- a/models/ltx_video/ltxv_handler.py +++ b/models/ltx_video/ltxv_handler.py @@ -26,6 +26,15 @@ class family_handler(): extra_model_def["sliding_window"] = True extra_model_def["image_prompt_types_allowed"] = "TSEV" + extra_model_def["guide_preprocessing"] = { + "selection": ["", "PV", "DV", "EV", "V"], + "labels" : { "V": "Use LTXV raw format"} + } + + extra_model_def["mask_preprocessing"] = { + "selection": ["", "A", "NA", "XA", "XNA"], + } + return extra_model_def @staticmethod diff --git a/models/qwen/pipeline_qwenimage.py b/models/qwen/pipeline_qwenimage.py index 07bdbd4..905b864 100644 --- a/models/qwen/pipeline_qwenimage.py +++ b/models/qwen/pipeline_qwenimage.py @@ -28,7 +28,7 @@ from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Aut from .autoencoder_kl_qwenimage import AutoencoderKLQwenImage from diffusers import FlowMatchEulerDiscreteScheduler from PIL import Image -from shared.utils.utils import calculate_new_dimensions +from shared.utils.utils import calculate_new_dimensions, convert_image_to_tensor, convert_tensor_to_image XLA_AVAILABLE = False @@ -563,6 +563,8 @@ class QwenImagePipeline(): #DiffusionPipeline callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 512, image = None, + image_mask = None, + denoising_strength = 0, callback=None, pipeline=None, loras_slists=None, @@ -694,14 +696,33 @@ class QwenImagePipeline(): #DiffusionPipeline image_width = image_width // multiple_of * multiple_of image_height = image_height // multiple_of * multiple_of ref_height, ref_width = 1568, 672 - if height * width < ref_height * ref_width: ref_height , ref_width = height , width - if image_height * image_width > ref_height * ref_width: - image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) - image = image.resize((image_width,image_height), resample=Image.Resampling.LANCZOS) + if image_mask is None: + if height * width < ref_height * ref_width: ref_height , ref_width = height , width + if image_height * image_width > ref_height * ref_width: + image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) + if (image_width,image_height) != image.size: + image = image.resize((image_width,image_height), resample=Image.Resampling.LANCZOS) + image_mask_latents = None + else: + # _, image_width, image_height = min( + # (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS + # ) + image_height, image_width = calculate_new_dimensions(height, width, image_height, image_width, False, block_size=multiple_of) + # image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) + height, width = image_height, image_width + image_mask_latents = convert_image_to_tensor(image_mask.resize((width // 16, height // 16), resample=Image.Resampling.LANCZOS)) + image_mask_latents = 
torch.where(image_mask_latents>-0.5, 1., 0. )[0:1] + image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0) + convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") + image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device) + prompt_image = image - image = self.image_processor.preprocess(image, image_height, image_width) - image = image.unsqueeze(2) + if image.size != (image_width, image_height): + image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + + image.save("nnn.png") + image = convert_image_to_tensor(image).unsqueeze(0).unsqueeze(2) has_neg_prompt = negative_prompt is not None or ( negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None @@ -744,6 +765,8 @@ class QwenImagePipeline(): #DiffusionPipeline generator, latents, ) + original_image_latents = None if image_latents is None else image_latents.clone() + if image is not None: img_shapes = [ [ @@ -788,6 +811,15 @@ class QwenImagePipeline(): #DiffusionPipeline negative_txt_seq_lens = ( negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None ) + morph = False + if image_mask_latents is not None and denoising_strength <= 1.: + first_step = int(len(timesteps) * (1. - denoising_strength)) + if not morph: + latent_noise_factor = timesteps[first_step]/1000 + latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + timesteps = timesteps[first_step:] + self.scheduler.timesteps = timesteps + self.scheduler.sigmas= self.scheduler.sigmas[first_step:] # 6. Denoising loop self.scheduler.set_begin_index(0) @@ -797,10 +829,15 @@ class QwenImagePipeline(): #DiffusionPipeline update_loras_slists(self.transformer, loras_slists, updated_num_steps) callback(-1, None, True, override_num_inference_steps = updated_num_steps) + for i, t in enumerate(timesteps): if self.interrupt: continue + if image_mask_latents is not None and denoising_strength <1. 
and i == first_step and morph: + latent_noise_factor = t/1000 + latents = original_image_latents * (1.0 - latent_noise_factor) + latents * latent_noise_factor + self._current_timestep = t # broadcast to batch dimension in a way that's compatible with ONNX/Core ML timestep = t.expand(latents.shape[0]).to(latents.dtype) @@ -865,6 +902,12 @@ class QwenImagePipeline(): #DiffusionPipeline # compute the previous noisy sample x_t -> x_t-1 latents_dtype = latents.dtype latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + if image_mask_latents is not None: + next_t = timesteps[i+1] if i append at end insert_pos = min(sel, len(cur) -1) cur_clean = cur # Build final list and selection @@ -330,6 +329,8 @@ class AdvancedMediaGallery: st["items"] = merged st["selected"] = new_sel + record_last_action(st,"add") + # print(f"gallery add: set sel {new_sel}") return gr.update(value=merged, selected_index=new_sel), st def _on_remove(self, state: Dict[str, Any], gallery) : @@ -342,8 +343,9 @@ class AdvancedMediaGallery: return gr.update(value=[], selected_index=None), st new_sel = min(sel, len(items) - 1) st["items"] = items; st["selected"] = new_sel - # return gr.update(value=items, selected_index=new_sel), st - return gr.update(value=items), st + record_last_action(st,"remove") + # print(f"gallery del: new sel {new_sel}") + return gr.update(value=items, selected_index=new_sel), st def _on_move(self, delta: int, state: Dict[str, Any], gallery) : st = get_state(state); items: List[Any] = get_list(gallery); sel = st.get("selected", None) @@ -354,11 +356,15 @@ class AdvancedMediaGallery: return gr.update(value=items, selected_index=sel), st items[sel], items[j] = items[j], items[sel] st["items"] = items; st["selected"] = j + record_last_action(st,"move") + # print(f"gallery move: set sel {j}") return gr.update(value=items, selected_index=j), st def _on_clear(self, state: Dict[str, Any]) : st = {"items": [], "selected": None, "single": get_state(state).get("single", False), "mode": self.media_mode} - return gr.update(value=[], selected_index=0), st + record_last_action(st,"clear") + # print(f"Clear all") + return gr.update(value=[], selected_index=None), st def _on_toggle_single(self, to_single: bool, state: Dict[str, Any]) : st = get_state(state); st["single"] = bool(to_single) @@ -382,30 +388,38 @@ class AdvancedMediaGallery: def mount(self, parent: Optional[gr.Blocks | gr.Group | gr.Row | gr.Column] = None, update_form = False): if parent is not None: with parent: - col = self._build_ui() + col = self._build_ui(update_form) else: - col = self._build_ui() + col = self._build_ui(update_form) if not update_form: self._wire_events() return col - def _build_ui(self) -> gr.Column: + def _build_ui(self, update = False) -> gr.Column: with gr.Column(elem_id=self.elem_id, elem_classes=self.elem_classes) as col: self.container = col self.state = gr.State(dict(self._initial_state)) - self.gallery = gr.Gallery( - label=self.label, - value=self._initial_state["items"], - height=self.height, - columns=self.columns, - show_label=self.show_label, - preview= True, - # type="pil", - file_types= list(IMAGE_EXTS) if self.media_mode == "image" else list(VIDEO_EXTS), - selected_index=self._initial_state["selected"], # server-side selection - ) + if update: + self.gallery = gr.update( + value=self._initial_state["items"], + selected_index=self._initial_state["selected"], # server-side selection + label=self.label, + show_label=self.show_label, + ) + else: + self.gallery = gr.Gallery( + 
value=self._initial_state["items"], + label=self.label, + height=self.height, + columns=self.columns, + show_label=self.show_label, + preview= True, + # type="pil", # very slow + file_types= list(IMAGE_EXTS) if self.media_mode == "image" else list(VIDEO_EXTS), + selected_index=self._initial_state["selected"], # server-side selection + ) # One-line controls exts = sorted(IMAGE_EXTS if self.media_mode == "image" else VIDEO_EXTS) if self.accept_filter else None @@ -418,10 +432,10 @@ class AdvancedMediaGallery: size="sm", min_width=1, ) - self.btn_remove = gr.Button("Remove", size="sm", min_width=1) + self.btn_remove = gr.Button(" Remove ", size="sm", min_width=1) self.btn_left = gr.Button("◀ Left", size="sm", visible=not self._initial_state["single"], min_width=1) self.btn_right = gr.Button("Right ▶", size="sm", visible=not self._initial_state["single"], min_width=1) - self.btn_clear = gr.Button("Clear", variant="secondary", size="sm", visible=not self._initial_state["single"], min_width=1) + self.btn_clear = gr.Button(" Clear ", variant="secondary", size="sm", visible=not self._initial_state["single"], min_width=1) return col @@ -430,14 +444,24 @@ class AdvancedMediaGallery: self.gallery.select( self._on_select, inputs=[self.state, self.gallery], - outputs=[self.state], + outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # Gallery value changed by user actions (click-to-add, drag-drop, internal remove, etc.) - self.gallery.change( + self.gallery.upload( + self._on_upload, + inputs=[self.gallery, self.state], + outputs=[self.gallery, self.state], + trigger_mode="always_last", + ) + + # Gallery value changed by user actions (click-to-add, drag-drop, internal remove, etc.) + self.gallery.upload( self._on_gallery_change, inputs=[self.gallery, self.state], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # Add via UploadButton @@ -445,6 +469,7 @@ class AdvancedMediaGallery: self._on_add, inputs=[self.upload_btn, self.state, self.gallery], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # Remove selected @@ -452,6 +477,7 @@ class AdvancedMediaGallery: self._on_remove, inputs=[self.state, self.gallery], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # Reorder using selected index, keep same item selected @@ -459,11 +485,13 @@ class AdvancedMediaGallery: lambda st, gallery: self._on_move(-1, st, gallery), inputs=[self.state, self.gallery], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) self.btn_right.click( lambda st, gallery: self._on_move(+1, st, gallery), inputs=[self.state, self.gallery], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # Clear all @@ -471,6 +499,7 @@ class AdvancedMediaGallery: self._on_clear, inputs=[self.state], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # ---------------- public API ---------------- diff --git a/shared/utils/utils.py b/shared/utils/utils.py index 7ddf1eb..5dbd0af 100644 --- a/shared/utils/utils.py +++ b/shared/utils/utils.py @@ -19,6 +19,7 @@ import tempfile import subprocess import json from functools import lru_cache +os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg") from PIL import Image @@ -207,30 +208,62 @@ def get_outpainting_frame_location(final_height, final_width, outpainting_dims if (margin_left + width) > final_width or outpainting_right == 0: margin_left = final_width - width return height, width, margin_top, margin_left -def calculate_new_dimensions(canvas_height, canvas_width, 
image_height, image_width, fit_into_canvas, block_size = 16): - if fit_into_canvas == None: +def rescale_and_crop(img, w, h): + ow, oh = img.size + target_ratio = w / h + orig_ratio = ow / oh + + if orig_ratio > target_ratio: + # Crop width first + nw = int(oh * target_ratio) + img = img.crop(((ow - nw) // 2, 0, (ow + nw) // 2, oh)) + else: + # Crop height first + nh = int(ow / target_ratio) + img = img.crop((0, (oh - nh) // 2, ow, (oh + nh) // 2)) + + return img.resize((w, h), Image.LANCZOS) + +def calculate_new_dimensions(canvas_height, canvas_width, image_height, image_width, fit_into_canvas, block_size = 16): + if fit_into_canvas == None or fit_into_canvas == 2: # return image_height, image_width return canvas_height, canvas_width - if fit_into_canvas: + if fit_into_canvas == 1: scale1 = min(canvas_height / image_height, canvas_width / image_width) scale2 = min(canvas_width / image_height, canvas_height / image_width) scale = max(scale1, scale2) - else: + else: #0 or #2 (crop) scale = (canvas_height * canvas_width / (image_height * image_width))**(1/2) new_height = round( image_height * scale / block_size) * block_size new_width = round( image_width * scale / block_size) * block_size return new_height, new_width -def resize_and_remove_background(img_list, budget_width, budget_height, rm_background, ignore_first, fit_into_canvas = False ): +def calculate_dimensions_and_resize_image(image, canvas_height, canvas_width, fit_into_canvas, fit_crop, block_size = 16): + if fit_crop: + image = rescale_and_crop(image, canvas_width, canvas_height) + new_width, new_height = image.size + else: + image_width, image_height = image.size + new_height, new_width = calculate_new_dimensions(canvas_height, canvas_width, image_height, image_width, fit_into_canvas, block_size = block_size ) + image = image.resize((new_width, new_height), resample=Image.Resampling.LANCZOS) + return image, new_height, new_width + +def resize_and_remove_background(img_list, budget_width, budget_height, rm_background, any_background_ref, fit_into_canvas = 0, block_size= 16, outpainting_dims = None ): if rm_background: session = new_session() output_list =[] for i, img in enumerate(img_list): width, height = img.size - - if fit_into_canvas: + if fit_into_canvas == None or any_background_ref == 1 and i==0 or any_background_ref == 2: + if outpainting_dims is not None: + resized_image =img + elif img.size != (budget_width, budget_height): + resized_image= img.resize((budget_width, budget_height), resample=Image.Resampling.LANCZOS) + else: + resized_image =img + elif fit_into_canvas == 1: white_canvas = np.ones((budget_height, budget_width, 3), dtype=np.uint8) * 255 scale = min(budget_height / height, budget_width / width) new_height = int(height * scale) @@ -242,10 +275,10 @@ def resize_and_remove_background(img_list, budget_width, budget_height, rm_backg resized_image = Image.fromarray(white_canvas) else: scale = (budget_height * budget_width / (height * width))**(1/2) - new_height = int( round(height * scale / 16) * 16) - new_width = int( round(width * scale / 16) * 16) + new_height = int( round(height * scale / block_size) * block_size) + new_width = int( round(width * scale / block_size) * block_size) resized_image= img.resize((new_width,new_height), resample=Image.Resampling.LANCZOS) - if rm_background and not (ignore_first and i == 0) : + if rm_background and not (any_background_ref and i==0 or any_background_ref == 2) : # resized_image = remove(resized_image, session=session, alpha_matting_erode_size = 
1,alpha_matting_background_threshold = 70, alpha_foreground_background_threshold = 100, alpha_matting = True, bgcolor=[255, 255, 255, 0]).convert('RGB') resized_image = remove(resized_image, session=session, alpha_matting_erode_size = 1, alpha_matting = True, bgcolor=[255, 255, 255, 0]).convert('RGB') output_list.append(resized_image) #alpha_matting_background_threshold = 30, alpha_foreground_background_threshold = 200, diff --git a/wgp.py b/wgp.py index 396e273..45cb5cf 100644 --- a/wgp.py +++ b/wgp.py @@ -1,4 +1,5 @@ import os +os.environ["GRADIO_LANG"] = "en" # # os.environ.pop("TORCH_LOGS", None) # make sure no env var is suppressing/overriding # os.environ["TORCH_LOGS"]= "recompiles" import torch._logging as tlog @@ -21,7 +22,7 @@ import numpy as np import importlib from shared.utils import notification_sound from shared.utils.loras_mutipliers import preparse_loras_multipliers, parse_loras_multipliers -from shared.utils.utils import convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video, calculate_new_dimensions, convert_image_to_tensor, get_video_frame +from shared.utils.utils import convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video, calculate_new_dimensions, convert_image_to_tensor, calculate_dimensions_and_resize_image, rescale_and_crop, get_video_frame, resize_and_remove_background from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, combine_and_concatenate_video_with_audio_tracks, cleanup_temp_audio_files, save_video, save_image from shared.utils.audio_video import save_image_metadata, read_image_metadata from shared.match_archi import match_nvidia_architecture @@ -61,7 +62,7 @@ PROMPT_VARS_MAX = 10 target_mmgp_version = "3.6.0" WanGP_version = "8.4" -settings_version = 2.31 +settings_version = 2.33 max_source_video_frames = 3000 prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None @@ -220,7 +221,7 @@ def process_prompt_and_add_tasks(state, model_choice): return get_queue_table(queue) model_def = get_model_def(model_type) model_handler = get_model_handler(model_type) - image_outputs = inputs["image_mode"] == 1 + image_outputs = inputs["image_mode"] > 0 any_steps_skipping = model_def.get("tea_cache", False) or model_def.get("mag_cache", False) model_type = get_base_model_type(model_type) inputs["model_filename"] = model_filename @@ -370,7 +371,7 @@ def process_prompt_and_add_tasks(state, model_choice): gr.Info("Mag Cache maximum number of steps is 50") return - if image_mode == 1: + if image_mode > 0: audio_prompt_type = "" if "B" in audio_prompt_type or "X" in audio_prompt_type: @@ -477,7 +478,8 @@ def process_prompt_and_add_tasks(state, model_choice): image_mask = None if "G" in video_prompt_type: - gr.Info(f"With Denoising Strength {denoising_strength:.1f}, denoising will start at Step no {int(num_inference_steps * (1. - denoising_strength))} ") + if image_mode == 0: + gr.Info(f"With Denoising Strength {denoising_strength:.1f}, denoising will start at Step no {int(num_inference_steps * (1. 
- denoising_strength))} ") else: denoising_strength = 1.0 if len(keep_frames_video_guide) > 0 and model_type in ["ltxv_13B"]: @@ -1334,9 +1336,11 @@ def update_queue_data(queue): data = get_queue_table(queue) if len(data) == 0: - return gr.DataFrame(visible=False) + return gr.DataFrame(value=[], max_height=1) + elif len(data) == 1: + return gr.DataFrame(value=data, max_height= 83) else: - return gr.DataFrame(value=data, visible= True) + return gr.DataFrame(value=data, max_height= 1000) def create_html_progress_bar(percentage=0.0, text="Idle", is_idle=True): bar_class = "progress-bar-custom idle" if is_idle else "progress-bar-custom" @@ -1371,6 +1375,12 @@ def _parse_args(): help="save proprocessed audio track with extract speakers for debugging or editing" ) + parser.add_argument( + "--debug-gen-form", + action="store_true", + help="View form generation / refresh time" + ) + parser.add_argument( "--vram-safety-coefficient", type=float, @@ -2070,8 +2080,6 @@ def fix_settings(model_type, ui_defaults): if image_prompt_type != None : if not isinstance(image_prompt_type, str): image_prompt_type = "S" if image_prompt_type == 0 else "SE" - # if model_type == "flf2v_720p" and not "E" in image_prompt_type: - # image_prompt_type = "SE" if settings_version <= 2: image_prompt_type = image_prompt_type.replace("G","") ui_defaults["image_prompt_type"] = image_prompt_type @@ -2091,10 +2099,7 @@ def fix_settings(model_type, ui_defaults): video_prompt_type = ui_defaults.get("video_prompt_type", "") - any_reference_image = model_def.get("reference_image", False) - if base_model_type in ["hunyuan_custom", "hunyuan_custom_edit", "hunyuan_custom_audio", "hunyuan_avatar", "phantom_14B", "phantom_1.3B"] or any_reference_image: - if not "I" in video_prompt_type: # workaround for settings corruption - video_prompt_type += "I" + if base_model_type in ["hunyuan"]: video_prompt_type = video_prompt_type.replace("I", "") @@ -2133,10 +2138,27 @@ def fix_settings(model_type, ui_defaults): del ui_defaults["tea_cache_start_step_perc"] ui_defaults["skip_steps_start_step_perc"] = tea_cache_start_step_perc + image_prompt_type = ui_defaults.get("image_prompt_type", "") + if len(image_prompt_type) > 0: + image_prompt_types_allowed = model_def.get("image_prompt_types_allowed","") + image_prompt_type = filter_letters(image_prompt_type, image_prompt_types_allowed) + ui_defaults["image_prompt_type"] = image_prompt_type + + video_prompt_type = ui_defaults.get("video_prompt_type", "") + image_ref_choices_list = model_def.get("image_ref_choices", {}).get("choices", []) + if len(image_ref_choices_list)==0: + video_prompt_type = del_in_sequence(video_prompt_type, "IK") + else: + first_choice = image_ref_choices_list[0][1] + if "I" in first_choice and not "I" in video_prompt_type: video_prompt_type += "I" + if len(image_ref_choices_list)==1 and "K" in first_choice and not "K" in video_prompt_type: video_prompt_type += "K" + ui_defaults["video_prompt_type"] = video_prompt_type + model_handler = get_model_handler(base_model_type) if hasattr(model_handler, "fix_settings"): model_handler.fix_settings(base_model_type, settings_version, model_def, ui_defaults) + def get_default_settings(model_type): def get_default_prompt(i2v): if i2v: @@ -3323,8 +3345,8 @@ def select_video(state, input_file_list, event_data: gr.EventData): if not all_letters(src, pos): return False if neg is not None and any_letters(src, neg): return False return True - image_outputs = configs.get("image_mode",0) == 1 - map_video_prompt = {"V" : "Control Video", ("VA", "U") : 
"Mask Video", "I" : "Reference Images"} + image_outputs = configs.get("image_mode",0) > 0 + map_video_prompt = {"V" : "Control Image" if image_outputs else "Control Video", ("VA", "U") : "Mask Image" if image_outputs else "Mask Video", "I" : "Reference Images"} map_image_prompt = {"V" : "Source Video", "L" : "Last Video", "S" : "Start Image", "E" : "End Image"} map_audio_prompt = {"A" : "Audio Source", "B" : "Audio Source #2"} video_other_prompts = [ v for s,v in map_image_prompt.items() if all_letters(video_image_prompt_type,s)] \ @@ -3571,9 +3593,27 @@ def process_images_multithread(image_processor, items, process_type, wrap_in_lis end_time = time.time() # print(f"duration:{end_time-start_time:.1f}") - return results + return results -def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, max_frames, start_frame=0, fit_canvas = False, target_fps = 16, block_size= 16, expand_scale = 2, process_type = "inpaint", process_type2 = None, to_bbox = False, RGB_Mask = False, negate_mask = False, process_outside_mask = None, inpaint_color = 127, outpainting_dims = None, proc_no = 1): +def preprocess_image_with_mask(input_image, input_mask, height, width, fit_canvas = False, block_size= 16, expand_scale = 2): + frame_width, frame_height = input_image.size + + if fit_canvas != None: + height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas = fit_canvas, block_size = block_size) + input_image = input_image.resize((width, height), resample=Image.Resampling.LANCZOS) + if input_mask is not None: + input_mask = input_mask.resize((width, height), resample=Image.Resampling.LANCZOS) + + if expand_scale != 0 and input_mask is not None: + kernel_size = abs(expand_scale) + kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size)) + op_expand = cv2.dilate if expand_scale > 0 else cv2.erode + input_mask = np.array(input_mask) + input_mask = op_expand(input_mask, kernel, iterations=3) + input_mask = Image.fromarray(input_mask) + return input_image, input_mask + +def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, max_frames, start_frame=0, fit_canvas = None, fit_crop = False, target_fps = 16, block_size= 16, expand_scale = 2, process_type = "inpaint", process_type2 = None, to_bbox = False, RGB_Mask = False, negate_mask = False, process_outside_mask = None, inpaint_color = 127, outpainting_dims = None, proc_no = 1): from shared.utils.utils import calculate_new_dimensions, get_outpainting_frame_location, get_outpainting_full_area_dimensions def mask_to_xyxy_box(mask): @@ -3615,6 +3655,9 @@ def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, if len(video) == 0 or any_mask and len(mask_video) == 0: return None, None + if fit_crop and outpainting_dims != None: + fit_crop = False + fit_canvas = 0 if fit_canvas is not None else None frame_height, frame_width, _ = video[0].shape @@ -3629,7 +3672,7 @@ def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, if outpainting_dims != None: final_height, final_width = height, width - height, width, margin_top, margin_left = get_outpainting_frame_location(final_height, final_width, outpainting_dims, 8) + height, width, margin_top, margin_left = get_outpainting_frame_location(final_height, final_width, outpainting_dims, 1) if any_mask: num_frames = min(len(video), len(mask_video)) @@ -3646,14 +3689,20 @@ def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, # for frame_idx in 
range(num_frames): def prep_prephase(frame_idx): frame = Image.fromarray(video[frame_idx].cpu().numpy()) #.asnumpy() - frame = frame.resize((width, height), resample=Image.Resampling.LANCZOS) + if fit_crop: + frame = rescale_and_crop(frame, width, height) + else: + frame = frame.resize((width, height), resample=Image.Resampling.LANCZOS) frame = np.array(frame) if any_mask: if any_identity_mask: mask = np.full( (height, width, 3), 0, dtype= np.uint8) else: mask = Image.fromarray(mask_video[frame_idx].cpu().numpy()) #.asnumpy() - mask = mask.resize((width, height), resample=Image.Resampling.LANCZOS) + if fit_crop: + mask = rescale_and_crop(mask, width, height) + else: + mask = mask.resize((width, height), resample=Image.Resampling.LANCZOS) mask = np.array(mask) if len(mask.shape) == 3 and mask.shape[2] == 3: @@ -3750,14 +3799,14 @@ def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, return torch.stack(masked_frames), torch.stack(masks) if any_mask else None -def preprocess_video(height, width, video_in, max_frames, start_frame=0, fit_canvas = None, target_fps = 16, block_size = 16): +def preprocess_video(height, width, video_in, max_frames, start_frame=0, fit_canvas = None, fit_crop = False, target_fps = 16, block_size = 16): frames_list = get_resampled_video(video_in, start_frame, max_frames, target_fps) if len(frames_list) == 0: return None - if fit_canvas == None: + if fit_canvas == None or fit_crop: new_height = height new_width = width else: @@ -3775,7 +3824,10 @@ def preprocess_video(height, width, video_in, max_frames, start_frame=0, fit_can processed_frames_list = [] for frame in frames_list: frame = Image.fromarray(np.clip(frame.cpu().numpy(), 0, 255).astype(np.uint8)) - frame = frame.resize((new_width,new_height), resample=Image.Resampling.LANCZOS) + if fit_crop: + frame = rescale_and_crop(frame, new_width, new_height) + else: + frame = frame.resize((new_width,new_height), resample=Image.Resampling.LANCZOS) processed_frames_list.append(frame) np_frames = [np.array(frame) for frame in processed_frames_list] @@ -4115,9 +4167,10 @@ def process_prompt_enhancer(prompt_enhancer, original_prompts, image_start, ori prompt_images = [] if "I" in prompt_enhancer: if image_start != None: - prompt_images.append(image_start) + prompt_images += image_start if original_image_refs != None: - prompt_images += original_image_refs[:1] + prompt_images += original_image_refs[:1] + prompt_images = [Image.open(img) if isinstance(img,str) else img for img in prompt_images] if len(original_prompts) == 0 and not "T" in prompt_enhancer: return None else: @@ -4223,7 +4276,7 @@ def enhance_prompt(state, prompt, prompt_enhancer, multi_images_gen_type, overri original_image_refs = inputs["image_refs"] if original_image_refs is not None: original_image_refs = [ convert_image(tup[0]) for tup in original_image_refs ] - is_image = inputs["image_mode"] == 1 + is_image = inputs["image_mode"] > 0 seed = inputs["seed"] seed = set_seed(seed) enhanced_prompts = [] @@ -4367,7 +4420,7 @@ def generate_video( model_def = get_model_def(model_type) - is_image = image_mode == 1 + is_image = image_mode > 0 if is_image: if min_frames_if_references >= 1000: video_length = min_frames_if_references - 1000 @@ -4377,19 +4430,22 @@ def generate_video( batch_size = 1 temp_filenames_list = [] - if image_guide is not None and isinstance(image_guide, Image.Image): - video_guide = convert_image_to_video(image_guide) - temp_filenames_list.append(video_guide) - image_guide = None + convert_image_guide_to_video = 
model_def.get("convert_image_guide_to_video", False) + if convert_image_guide_to_video: + if image_guide is not None and isinstance(image_guide, Image.Image): + video_guide = convert_image_to_video(image_guide) + temp_filenames_list.append(video_guide) + image_guide = None - if image_mask is not None and isinstance(image_mask, Image.Image): - video_mask = convert_image_to_video(image_mask) - temp_filenames_list.append(video_mask) - image_mask = None + if image_mask is not None and isinstance(image_mask, Image.Image): + video_mask = convert_image_to_video(image_mask) + temp_filenames_list.append(video_mask) + image_mask = None + if model_def.get("no_background_removal", False): remove_background_images_ref = 0 + base_model_type = get_base_model_type(model_type) model_family = get_model_family(base_model_type) - fit_canvas = server_config.get("fit_canvas", 0) model_handler = get_model_handler(base_model_type) block_size = model_handler.get_vae_block_size(base_model_type) if hasattr(model_handler, "get_vae_block_size") else 16 @@ -4415,7 +4471,7 @@ def generate_video( return width, height = resolution.split("x") - width, height = int(width), int(height) + width, height = int(width) // block_size * block_size, int(height) // block_size * block_size default_image_size = (height, width) if slg_switch == 0: @@ -4530,39 +4586,25 @@ def generate_video( original_image_refs = image_refs # image_refs = None # nb_frames_positions= 0 - frames_to_inject = [] - any_background_ref = False - outpainting_dims = None if video_guide_outpainting== None or len(video_guide_outpainting) == 0 or video_guide_outpainting == "0 0 0 0" or video_guide_outpainting.startswith("#") else [int(v) for v in video_guide_outpainting.split(" ")] # Output Video Ratio Priorities: # Source Video or Start Image > Control Video > Image Ref (background or positioned frames only) > UI Width, Height # Image Ref (non background and non positioned frames) are boxed in a white canvas in order to keep their own width/height ratio - - if image_refs is not None and len(image_refs) > 0: + frames_to_inject = [] + if image_refs is not None: frames_positions_list = [ int(pos)-1 for pos in frames_positions.split(" ")] if frames_positions is not None and len(frames_positions)> 0 else [] frames_positions_list = frames_positions_list[:len(image_refs)] nb_frames_positions = len(frames_positions_list) - if nb_frames_positions > 0: - frames_to_inject = [None] * (max(frames_positions_list) + 1) - for i, pos in enumerate(frames_positions_list): - frames_to_inject[pos] = image_refs[i] - if video_guide == None and video_source == None and not "L" in image_prompt_type and (nb_frames_positions > 0 or "K" in video_prompt_type) : - from shared.utils.utils import get_outpainting_full_area_dimensions - w, h = image_refs[0].size - if outpainting_dims != None: - h, w = get_outpainting_full_area_dimensions(h,w, outpainting_dims) - default_image_size = calculate_new_dimensions(height, width, h, w, fit_canvas) - fit_canvas = None - # if there is a source video and a background image ref, the height/width ratio will need to be processed later by the code for the model (we dont know the source video dimensions at this point) - if len(image_refs) > nb_frames_positions: - any_background_ref = "K" in video_prompt_type - if remove_background_images_ref > 0: - send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")]) - os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg") - from shared.utils.utils import 
resize_and_remove_background - # keep image ratios if there is a background image ref (we will let the model preprocessor decide what to do) but remove bg if requested - image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= not (any_background_ref or model_def.get("lock_image_refs_ratios", False)) ) # no fit for vace ref images as it is done later - update_task_thumbnails(task, locals()) - send_cmd("output") + any_background_ref = 0 + if "K" in video_prompt_type: + any_background_ref = 2 if model_def.get("all_image_refs_are_background_ref", False) else 1 + + outpainting_dims = None if video_guide_outpainting== None or len(video_guide_outpainting) == 0 or video_guide_outpainting == "0 0 0 0" or video_guide_outpainting.startswith("#") else [int(v) for v in video_guide_outpainting.split(" ")] + fit_canvas = server_config.get("fit_canvas", 0) + fit_crop = fit_canvas == 2 + if fit_crop and outpainting_dims is not None: + fit_crop = False + fit_canvas = 0 + joint_pass = boost ==1 #and profile != 1 and profile != 3 skip_steps_cache = None if len(skip_steps_cache_type) == 0 else DynamicClass(cache_type = skip_steps_cache_type) @@ -4632,6 +4674,9 @@ def generate_video( length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) current_video_length = min(current_video_length, length) + if image_guide is not None: + image_guide, image_mask = preprocess_image_with_mask(image_guide, image_mask, height, width, fit_canvas = None, block_size= block_size, expand_scale = mask_expand) + seed = set_seed(seed) torch.set_grad_enabled(False) @@ -4723,22 +4768,22 @@ def generate_video( return_latent_slice = None if reuse_frames > 0: return_latent_slice = slice(-(reuse_frames - 1 + discard_last_frames ) // latent_size - 1, None if discard_last_frames == 0 else -(discard_last_frames // latent_size) ) - refresh_preview = {"image_guide" : None, "image_mask" : None} + refresh_preview = {"image_guide" : image_guide, "image_mask" : image_mask} if image_mode >= 1 else {} src_ref_images = image_refs image_start_tensor = image_end_tensor = None if window_no == 1 and (video_source is not None or image_start is not None): if image_start is not None: - new_height, new_width = calculate_new_dimensions(height, width, image_start.height, image_start.width, sample_fit_canvas, block_size = block_size) - image_start_tensor = image_start.resize((new_width, new_height), resample=Image.Resampling.LANCZOS) + image_start_tensor, new_height, new_width = calculate_dimensions_and_resize_image(image_start, height, width, sample_fit_canvas, fit_crop, block_size = block_size) + if fit_crop: refresh_preview["image_start"] = image_start_tensor image_start_tensor = convert_image_to_tensor(image_start_tensor) pre_video_guide = prefix_video = image_start_tensor.unsqueeze(1) else: - if "L" in image_prompt_type: - refresh_preview["video_source"] = get_video_frame(video_source, 0) - prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, target_fps = fps, block_size = block_size ) + prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, block_size = block_size ) prefix_video = prefix_video.permute(3, 0, 1, 2) prefix_video = 
prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w + if fit_crop or "L" in image_prompt_type: refresh_preview["video_source"] = convert_tensor_to_image(prefix_video, 0) + new_height, new_width = prefix_video.shape[-2:] pre_video_guide = prefix_video[:, -reuse_frames:] pre_video_frame = convert_tensor_to_image(prefix_video[:, -1]) @@ -4752,10 +4797,11 @@ def generate_video( image_end_list= image_end if isinstance(image_end, list) else [image_end] if len(image_end_list) >= window_no: new_height, new_width = image_size - image_end_tensor =image_end_list[window_no-1].resize((new_width, new_height), resample=Image.Resampling.LANCZOS) + image_end_tensor, _, _ = calculate_dimensions_and_resize_image(image_end_list[window_no-1], new_height, new_width, sample_fit_canvas, fit_crop, block_size = block_size) + # image_end_tensor =image_end_list[window_no-1].resize((new_width, new_height), resample=Image.Resampling.LANCZOS) + refresh_preview["image_end"] = image_end_tensor image_end_tensor = convert_image_to_tensor(image_end_tensor) image_end_list= None - window_start_frame = guide_start_frame - (reuse_frames if window_no > 1 else source_video_overlap_frames_count) guide_end_frame = guide_start_frame + current_video_length - (source_video_overlap_frames_count if window_no == 1 else reuse_frames) alignment_shift = source_video_frames_count if reset_control_aligment else 0 @@ -4768,7 +4814,8 @@ def generate_video( from models.wan.multitalk.multitalk import get_window_audio_embeddings # special treatment for start frame pos when alignement to first frame requested as otherwise the start frame number will be negative due to overlapped frames (has been previously compensated later with padding) audio_proj_split = get_window_audio_embeddings(audio_proj_full, audio_start_idx= aligned_window_start_frame + (source_video_overlap_frames_count if reset_control_aligment else 0 ), clip_length = current_video_length) - + if vace: + video_guide_processed = video_mask_processed = video_guide_processed2 = video_mask_processed2 = None if video_guide is not None: keep_frames_parsed, error = parse_keep_frames_video_guide(keep_frames_video_guide, source_video_frames_count -source_video_overlap_frames_count + requested_frames_to_generate) @@ -4776,12 +4823,44 @@ def generate_video( raise gr.Error(f"invalid keep frames {keep_frames_video_guide}") keep_frames_parsed = keep_frames_parsed[aligned_guide_start_frame: aligned_guide_end_frame ] - if ltxv: + if vace: + context_scale = [ control_net_weight] + if "V" in video_prompt_type: + process_outside_mask = process_map_outside_mask.get(filter_letters(video_prompt_type, "YWX"), None) + preprocess_type, preprocess_type2 = "raw", None + for process_num, process_letter in enumerate( filter_letters(video_prompt_type, "PDSLCMU")): + if process_num == 0: + preprocess_type = process_map_video_guide.get(process_letter, "raw") + else: + preprocess_type2 = process_map_video_guide.get(process_letter, None) + status_info = "Extracting " + processes_names[preprocess_type] + extra_process_list = ([] if preprocess_type2==None else [preprocess_type2]) + ([] if process_outside_mask==None or process_outside_mask == preprocess_type else [process_outside_mask]) + if len(extra_process_list) == 1: + status_info += " and " + processes_names[extra_process_list[0]] + elif len(extra_process_list) == 2: + status_info += ", " + processes_names[extra_process_list[0]] + " and " + processes_names[extra_process_list[1]] + if preprocess_type2 is not None: + context_scale = [ control_net_weight /2, 
control_net_weight2 /2] + send_cmd("progress", [0, get_latest_status(state, status_info)]) + video_guide_processed, video_mask_processed = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) , start_frame = aligned_guide_start_frame, fit_canvas = sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, process_type = preprocess_type, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =1 ) + if preprocess_type2 != None: + video_guide_processed2, video_mask_processed2 = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed), start_frame = aligned_guide_start_frame, fit_canvas = sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, process_type = preprocess_type2, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =2 ) + + if video_guide_processed != None: + if sample_fit_canvas != None: + image_size = video_guide_processed.shape[-3: -1] + sample_fit_canvas = None + refresh_preview["video_guide"] = Image.fromarray(video_guide_processed[0].cpu().numpy()) + if video_guide_processed2 != None: + refresh_preview["video_guide"] = [refresh_preview["video_guide"], Image.fromarray(video_guide_processed2[0].cpu().numpy())] + if video_mask_processed != None: + refresh_preview["video_mask"] = Image.fromarray(video_mask_processed[0].cpu().numpy()) + elif ltxv: preprocess_type = process_map_video_guide.get(filter_letters(video_prompt_type, "PED"), "raw") status_info = "Extracting " + processes_names[preprocess_type] send_cmd("progress", [0, get_latest_status(state, status_info)]) # start one frame ealier to facilitate latents merging later - src_video, _ = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) + (0 if aligned_guide_start_frame == 0 else 1), start_frame = aligned_guide_start_frame - (0 if aligned_guide_start_frame == 0 else 1), fit_canvas = sample_fit_canvas, target_fps = fps, process_type = preprocess_type, inpaint_color = 0, proc_no =1, negate_mask = "N" in video_prompt_type, process_outside_mask = "inpaint" if "X" in video_prompt_type else "identity", block_size =block_size ) + src_video, _ = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) + (0 if aligned_guide_start_frame == 0 else 1), start_frame = aligned_guide_start_frame - (0 if aligned_guide_start_frame == 0 else 1), fit_canvas = sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, process_type = preprocess_type, inpaint_color = 0, proc_no =1, negate_mask = "N" in video_prompt_type, process_outside_mask = "inpaint" if "X" in video_prompt_type else "identity", block_size =block_size ) if src_video != None: src_video = src_video[ :(len(src_video)-1)// latent_size * latent_size +1 ] refresh_preview["video_guide"] = Image.fromarray(src_video[0].cpu().numpy()) @@ -4798,15 +4877,14 @@ def generate_video( progress_args = [0, get_latest_status(state,"Extracting Video and Mask")] send_cmd("progress", progress_args) - src_video, src_mask = preprocess_video_with_mask(video_guide, video_mask, height=height, width = width, max_frames= current_video_length if window_no == 1 
else current_video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas = sample_fit_canvas, target_fps = fps, process_type= "pose" if "P" in video_prompt_type else "inpaint", negate_mask = "N" in video_prompt_type, inpaint_color =0) + src_video, src_mask = preprocess_video_with_mask(video_guide, video_mask, height=height, width = width, max_frames= current_video_length if window_no == 1 else current_video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas = sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, process_type= "pose" if "P" in video_prompt_type else "inpaint", negate_mask = "N" in video_prompt_type, inpaint_color =0) refresh_preview["video_guide"] = Image.fromarray(src_video[0].cpu().numpy()) if src_mask != None: refresh_preview["video_mask"] = Image.fromarray(src_mask[0].cpu().numpy()) elif "R" in video_prompt_type: # sparse video to video src_image = get_video_frame(video_guide, aligned_guide_start_frame, return_last_if_missing = True, return_PIL = True) - new_height, new_width = calculate_new_dimensions(image_size[0], image_size[1], src_image.height, src_image.width, sample_fit_canvas, block_size = block_size) - src_image = src_image.resize((new_width, new_height), resample=Image.Resampling.LANCZOS) + src_image, _, _ = calculate_dimensions_and_resize_image(src_image, new_height, new_width, sample_fit_canvas, fit_crop, block_size = block_size) refresh_preview["video_guide"] = src_image src_video = convert_image_to_tensor(src_image).unsqueeze(1) if sample_fit_canvas != None: @@ -4814,7 +4892,7 @@ def generate_video( sample_fit_canvas = None else: # video to video - video_guide_processed = preprocess_video(width = image_size[1], height=image_size[0], video_in=video_guide, max_frames= len(keep_frames_parsed), start_frame = aligned_guide_start_frame, fit_canvas= sample_fit_canvas, target_fps = fps) + video_guide_processed = preprocess_video(width = image_size[1], height=image_size[0], video_in=video_guide, max_frames= len(keep_frames_parsed), start_frame = aligned_guide_start_frame, fit_canvas= sample_fit_canvas, fit_crop = fit_crop, target_fps = fps) if video_guide_processed is None: src_video = pre_video_guide else: @@ -4824,42 +4902,54 @@ def generate_video( src_video = video_guide_processed.float().div_(127.5).sub_(1.).permute(-1,0,1,2) if pre_video_guide != None: src_video = torch.cat( [pre_video_guide, src_video], dim=1) + elif image_guide is not None: + image_guide, new_height, new_width = calculate_dimensions_and_resize_image(image_guide, height, width, sample_fit_canvas, fit_crop, block_size = block_size) + image_size = (new_height, new_width) + refresh_preview["image_guide"] = image_guide + sample_fit_canvas = None + if image_mask is not None: + image_mask, _, _ = calculate_dimensions_and_resize_image(image_mask, new_height, new_width, sample_fit_canvas, fit_crop, block_size = block_size) + refresh_preview["image_mask"] = image_mask + + if window_no == 1 and image_refs is not None and len(image_refs) > 0: + if sample_fit_canvas is not None and (nb_frames_positions > 0 or "K" in video_prompt_type) : + from shared.utils.utils import get_outpainting_full_area_dimensions + w, h = image_refs[0].size + if outpainting_dims != None: + h, w = get_outpainting_full_area_dimensions(h,w, outpainting_dims) + image_size = calculate_new_dimensions(height, width, h, w, fit_canvas) + sample_fit_canvas = None + if repeat_no == 1: + if fit_crop: + if any_background_ref == 2: + end_ref_position = len(image_refs) + elif any_background_ref == 1: + 
end_ref_position = nb_frames_positions + 1 + else: + end_ref_position = nb_frames_positions + for i, img in enumerate(image_refs[:end_ref_position]): + image_refs[i] = rescale_and_crop(img, default_image_size[1], default_image_size[0]) + refresh_preview["image_refs"] = image_refs + + if len(image_refs) > nb_frames_positions: + if remove_background_images_ref > 0: + send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")]) + # keep image ratios if there is a background image ref (we will let the model preprocessor decide what to do) but remove bg if requested + image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , image_size[1], image_size[0], + remove_background_images_ref > 0, any_background_ref, + fit_into_canvas= 0 if (any_background_ref > 0 or model_def.get("lock_image_refs_ratios", False)) else 1, + block_size=block_size, + outpainting_dims =outpainting_dims ) + refresh_preview["image_refs"] = image_refs + + if nb_frames_positions > 0: + frames_to_inject = [None] * (max(frames_positions_list) + 1) + for i, pos in enumerate(frames_positions_list): + frames_to_inject[pos] = image_refs[i] if vace : - image_refs_copy = image_refs[nb_frames_positions:].copy() if image_refs != None and len(image_refs) > nb_frames_positions else None # required since prepare_source do inplace modifications - context_scale = [ control_net_weight] - video_guide_processed = video_mask_processed = video_guide_processed2 = video_mask_processed2 = None - if "V" in video_prompt_type: - process_outside_mask = process_map_outside_mask.get(filter_letters(video_prompt_type, "YWX"), None) - preprocess_type, preprocess_type2 = "raw", None - for process_num, process_letter in enumerate( filter_letters(video_prompt_type, "PDSLCMU")): - if process_num == 0: - preprocess_type = process_map_video_guide.get(process_letter, "raw") - else: - preprocess_type2 = process_map_video_guide.get(process_letter, None) - status_info = "Extracting " + processes_names[preprocess_type] - extra_process_list = ([] if preprocess_type2==None else [preprocess_type2]) + ([] if process_outside_mask==None or process_outside_mask == preprocess_type else [process_outside_mask]) - if len(extra_process_list) == 1: - status_info += " and " + processes_names[extra_process_list[0]] - elif len(extra_process_list) == 2: - status_info += ", " + processes_names[extra_process_list[0]] + " and " + processes_names[extra_process_list[1]] - if preprocess_type2 is not None: - context_scale = [ control_net_weight /2, control_net_weight2 /2] - send_cmd("progress", [0, get_latest_status(state, status_info)]) - video_guide_processed, video_mask_processed = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) , start_frame = aligned_guide_start_frame, fit_canvas = sample_fit_canvas, target_fps = fps, process_type = preprocess_type, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =1 ) - if preprocess_type2 != None: - video_guide_processed2, video_mask_processed2 = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed), start_frame = aligned_guide_start_frame, fit_canvas = sample_fit_canvas, target_fps = fps, process_type = preprocess_type2, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in 
video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =2 ) - - if video_guide_processed != None: - if sample_fit_canvas != None: - image_size = video_guide_processed.shape[-3: -1] - sample_fit_canvas = None - refresh_preview["video_guide"] = Image.fromarray(video_guide_processed[0].cpu().numpy()) - if video_guide_processed2 != None: - refresh_preview["video_guide"] = [refresh_preview["video_guide"], Image.fromarray(video_guide_processed2[0].cpu().numpy())] - if video_mask_processed != None: - refresh_preview["video_mask"] = Image.fromarray(video_mask_processed[0].cpu().numpy()) frames_to_inject_parsed = frames_to_inject[aligned_guide_start_frame: aligned_guide_end_frame] + image_refs_copy = image_refs[nb_frames_positions:].copy() if image_refs != None and len(image_refs) > nb_frames_positions else None # required since prepare_source do inplace modifications src_video, src_mask, src_ref_images = wan_model.prepare_source([video_guide_processed] if video_guide_processed2 == None else [video_guide_processed, video_guide_processed2], [video_mask_processed] if video_guide_processed2 == None else [video_mask_processed, video_mask_processed2], @@ -4868,7 +4958,6 @@ def generate_video( keep_video_guide_frames=keep_frames_parsed, start_frame = aligned_guide_start_frame, pre_src_video = [pre_video_guide] if video_guide_processed2 == None else [pre_video_guide, pre_video_guide], - fit_into_canvas = sample_fit_canvas, inject_frames= frames_to_inject_parsed, outpainting_dims = outpainting_dims, any_background_ref = any_background_ref @@ -4931,9 +5020,9 @@ def generate_video( prefix_frames_count = source_video_overlap_frames_count if window_no <= 1 else reuse_frames, frame_num= (current_video_length // latent_size)* latent_size + 1, batch_size = batch_size, - height = height, - width = width, - fit_into_canvas = fit_canvas == 1, + height = image_size[0], + width = image_size[1], + fit_into_canvas = fit_canvas, shift=flow_shift, sample_solver=sample_solver, sampling_steps=num_inference_steps, @@ -4990,6 +5079,8 @@ def generate_video( pre_video_frame = pre_video_frame, original_input_ref_images = original_image_refs[nb_frames_positions:] if original_image_refs is not None else [], image_refs_relative_size = image_refs_relative_size, + image_guide= image_guide, + image_mask= image_mask, ) except Exception as e: if len(control_audio_tracks) > 0 or len(source_audio_tracks) > 0: @@ -5931,7 +6022,7 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None if target == "settings": return inputs - image_outputs = inputs.get("image_mode",0) == 1 + image_outputs = inputs.get("image_mode",0) > 0 pop=[] if "force_fps" in inputs and len(inputs["force_fps"])== 0: @@ -5947,13 +6038,13 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None pop += ["MMAudio_setting", "MMAudio_prompt", "MMAudio_neg_prompt"] video_prompt_type = inputs["video_prompt_type"] - if not base_model_type in ["t2v"]: + if not "G" in video_prompt_type: pop += ["denoising_strength"] if not (server_config.get("enhancer_enabled", 0) > 0 and server_config.get("enhancer_mode", 0) == 0): pop += ["prompt_enhancer"] - if not recammaster and not diffusion_forcing and not flux: + if model_def.get("model_modes", None) is None: pop += ["model_mode"] if not vace and not phantom and not hunyuan_video_custom: @@ -6075,6 +6166,18 @@ def image_to_ref_image_set(state, input_file_list, choice, target, target_name): gr.Info(f"Selected Image was copied to 
{target_name}") return file_list[choice] +def image_to_ref_image_guide(state, input_file_list, choice): + file_list, file_settings_list = get_file_list(state, input_file_list) + if len(file_list) == 0 or choice == None or choice < 0 or choice > len(file_list): return gr.update(), gr.update() + ui_settings = get_current_model_settings(state) + gr.Info(f"Selected Image was copied to Control Image") + new_image = file_list[choice] + if ui_settings["image_mode"]==2: + return new_image, new_image + else: + return new_image, None + + def apply_post_processing(state, input_file_list, choice, PP_temporal_upsampling, PP_spatial_upsampling, PP_film_grain_intensity, PP_film_grain_saturation): gen = get_gen_info(state) @@ -6142,11 +6245,11 @@ def eject_video_from_gallery(state, input_file_list, choice): return gr.Gallery(value = file_list, selected_index= choice), gr.update() if len(file_list) >0 else get_default_video_info(), gr.Row(visible= len(file_list) > 0) def has_video_file_extension(filename): - extension = os.path.splitext(filename)[-1] + extension = os.path.splitext(filename)[-1].lower() return extension in [".mp4"] def has_image_file_extension(filename): - extension = os.path.splitext(filename)[-1] + extension = os.path.splitext(filename)[-1].lower() return extension in [".png", ".jpg", ".jpeg", ".bmp", ".gif", ".webp", ".tif", ".tiff", ".jfif", ".pjpeg"] def add_videos_to_gallery(state, input_file_list, choice, files_to_load): gen = get_gen_info(state) @@ -6308,7 +6411,7 @@ def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, sw return configs, any_image_or_video def record_image_mode_tab(state, evt:gr.SelectData): - state["image_mode_tab"] = 0 if evt.index ==0 else 1 + state["image_mode_tab"] = evt.index def switch_image_mode(state): image_mode = state.get("image_mode_tab", 0) @@ -6316,7 +6419,18 @@ def switch_image_mode(state): ui_defaults = get_model_settings(state, model_type) ui_defaults["image_mode"] = image_mode - + video_prompt_type = ui_defaults.get("video_prompt_type", "") + model_def = get_model_def( model_type) + inpaint_support = model_def.get("inpaint_support", False) + if inpaint_support: + if image_mode == 1: + video_prompt_type = del_in_sequence(video_prompt_type, "VAG") + video_prompt_type = add_to_sequence(video_prompt_type, "KI") + elif image_mode == 2: + video_prompt_type = add_to_sequence(video_prompt_type, "VAG") + video_prompt_type = del_in_sequence(video_prompt_type, "KI") + ui_defaults["video_prompt_type"] = video_prompt_type + return str(time.time()) def load_settings_from_file(state, file_path): @@ -6349,6 +6463,7 @@ def load_settings_from_file(state, file_path): def save_inputs( target, + image_mask_guide, lset_name, image_mode, prompt, @@ -6434,13 +6549,18 @@ def save_inputs( state, ): - - # if state.get("validate_success",0) != 1: - # return + model_filename = state["model_filename"] model_type = state["model_type"] + if image_mask_guide is not None and image_mode == 2: + if "background" in image_mask_guide: + image_guide = image_mask_guide["background"] + if "layers" in image_mask_guide and len(image_mask_guide["layers"])>0: + image_mask = image_mask_guide["layers"][0] + image_mask_guide = None inputs = get_function_arguments(save_inputs, locals()) inputs.pop("target") + inputs.pop("image_mask_guide") cleaned_inputs = prepare_inputs_dict(target, inputs) if target == "settings": defaults_filename = get_settings_file_name(model_type) @@ -6544,11 +6664,16 @@ def change_model(state, model_choice): return header -def fill_inputs(state): 
+def get_current_model_settings(state): model_type = state["model_type"] - ui_defaults = get_model_settings(state, model_type) + ui_defaults = get_model_settings(state, model_type) if ui_defaults == None: ui_defaults = get_default_settings(model_type) + set_model_settings(state, model_type, ui_defaults) + return ui_defaults + +def fill_inputs(state): + ui_defaults = get_current_model_settings(state) return generate_video_tab(update_form = True, state_dict = state, ui_defaults = ui_defaults) @@ -6623,7 +6748,9 @@ def refresh_image_prompt_type_radio(state, image_prompt_type, image_prompt_type_ image_prompt_type = del_in_sequence(image_prompt_type, "VLTS") image_prompt_type = add_to_sequence(image_prompt_type, image_prompt_type_radio) any_video_source = len(filter_letters(image_prompt_type, "VL"))>0 - end_visible = any_letters(image_prompt_type, "SVL") + model_def = get_model_def(state["model_type"]) + image_prompt_types_allowed = model_def.get("image_prompt_types_allowed", "") + end_visible = "E" in image_prompt_types_allowed and any_letters(image_prompt_type, "SVL") return image_prompt_type, gr.update(visible = "S" in image_prompt_type ), gr.update(visible = end_visible and ("E" in image_prompt_type) ), gr.update(visible = "V" in image_prompt_type) , gr.update(visible = any_video_source), gr.update(visible = end_visible) def refresh_image_prompt_type_endcheckbox(state, image_prompt_type, image_prompt_type_radio, end_checkbox): @@ -6654,7 +6781,7 @@ def refresh_video_prompt_type_video_mask(state, video_prompt_type, video_prompt_ visible= "A" in video_prompt_type model_type = state["model_type"] model_def = get_model_def(model_type) - image_outputs = image_mode == 1 + image_outputs = image_mode > 0 return video_prompt_type, gr.update(visible= visible and not image_outputs), gr.update(visible= visible and image_outputs), gr.update(visible= visible ) def refresh_video_prompt_type_alignment(state, video_prompt_type, video_prompt_type_video_guide): @@ -6663,20 +6790,22 @@ def refresh_video_prompt_type_alignment(state, video_prompt_type, video_prompt_t return video_prompt_type def refresh_video_prompt_type_video_guide(state, video_prompt_type, video_prompt_type_video_guide, image_mode): - video_prompt_type = del_in_sequence(video_prompt_type, "PDESLCMGUV") + video_prompt_type = del_in_sequence(video_prompt_type, "PDESLCMUV") video_prompt_type = add_to_sequence(video_prompt_type, video_prompt_type_video_guide) visible = "V" in video_prompt_type model_type = state["model_type"] base_model_type = get_base_model_type(model_type) mask_visible = visible and "A" in video_prompt_type and not "U" in video_prompt_type model_def = get_model_def(model_type) - image_outputs = image_mode == 1 + image_outputs = image_mode > 0 vace= test_vace_module(model_type) keep_frames_video_guide_visible = not image_outputs and visible and not model_def.get("keep_frames_video_guide_not_supported", False) return video_prompt_type, gr.update(visible = visible and not image_outputs), gr.update(visible = visible and image_outputs), gr.update(visible = keep_frames_video_guide_visible), gr.update(visible = visible and "G" in video_prompt_type), gr.update(visible= (visible or "F" in video_prompt_type or "K" in video_prompt_type) and vace), gr.update(visible= visible and not "U" in video_prompt_type ), gr.update(visible= mask_visible and not image_outputs), gr.update(visible= mask_visible and image_outputs), gr.update(visible= mask_visible) def refresh_video_prompt_type_video_guide_alt(state, video_prompt_type, 
video_prompt_type_video_guide_alt): - video_prompt_type = del_in_sequence(video_prompt_type, "RGUVQKI") + model_def = get_model_def(state["model_type"]) + guide_custom_choices = model_def.get("guide_custom_choices",{}) + video_prompt_type = del_in_sequence(video_prompt_type, guide_custom_choices.get("letters_filter","")) video_prompt_type = add_to_sequence(video_prompt_type, video_prompt_type_video_guide_alt) control_video_visible = "V" in video_prompt_type ref_images_visible = "I" in video_prompt_type @@ -6711,7 +6840,7 @@ def get_image_end_label(multi_prompts_gen_type): return "Images as ending points for new Videos in the Generation Queue" if multi_prompts_gen_type == 0 else "Images as ending points for each new Window of the same Video Generation" def refresh_prompt_labels(multi_prompts_gen_type, image_mode): - prompt_label, wizard_prompt_label = get_prompt_labels(multi_prompts_gen_type, image_mode == 1) + prompt_label, wizard_prompt_label = get_prompt_labels(multi_prompts_gen_type, image_mode > 0) return gr.update(label=prompt_label), gr.update(label = wizard_prompt_label), gr.update(label=get_image_end_label(multi_prompts_gen_type)) def show_preview_column_modal(state, column_no): @@ -7032,7 +7161,6 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non diffusion_forcing = "diffusion_forcing" in model_filename ltxv = "ltxv" in model_filename lock_inference_steps = model_def.get("lock_inference_steps", False) - model_reference_image = model_def.get("reference_image", False) any_tea_cache = model_def.get("tea_cache", False) any_mag_cache = model_def.get("mag_cache", False) recammaster = base_model_type in ["recam_1.3B"] @@ -7075,18 +7203,22 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non if not v2i_switch_supported and not image_outputs: image_mode_value = 0 else: - image_outputs = image_mode_value == 1 + image_outputs = image_mode_value > 0 + inpaint_support = model_def.get("inpaint_support", False) image_mode = gr.Number(value =image_mode_value, visible = False) - - with gr.Tabs(visible = v2i_switch_supported, selected= "t2i" if image_mode_value == 1 else "t2v" ) as image_mode_tabs: - with gr.Tab("Text to Video", id = "t2v", elem_classes="compact_tab"): + image_mode_tab_selected= "t2i" if image_mode_value == 1 else ("inpaint" if image_mode_value == 2 else "t2v") + with gr.Tabs(visible = v2i_switch_supported or inpaint_support, selected= image_mode_tab_selected ) as image_mode_tabs: + with gr.Tab("Text to Video", id = "t2v", elem_classes="compact_tab", visible = v2i_switch_supported) as tab_t2v: pass with gr.Tab("Text to Image", id = "t2i", elem_classes="compact_tab"): pass + with gr.Tab("Image Inpainting", id = "inpaint", elem_classes="compact_tab", visible=inpaint_support) as tab_inpaint: + pass image_prompt_types_allowed = model_def.get("image_prompt_types_allowed", "") model_mode_choices = model_def.get("model_modes", None) with gr.Column(visible= len(image_prompt_types_allowed)> 0 or model_mode_choices is not None) as image_prompt_column: + # Video Continue / Start Frame / End Frame image_prompt_type_value= ui_defaults.get("image_prompt_type","") image_prompt_type = gr.Text(value= image_prompt_type_value, visible= False) image_prompt_type_choices = [] @@ -7123,192 +7255,167 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non model_mode = gr.Dropdown(choices=model_mode_choices["choices"], value=ui_defaults.get("model_mode", model_mode_choices["default"]), 
label=model_mode_choices["label"], visible=True) keep_frames_video_source = gr.Text(value=ui_defaults.get("keep_frames_video_source","") , visible= len(filter_letters(image_prompt_type_value, "VL"))>0 , scale = 2, label= "Truncate Video beyond this number of resampled Frames (empty=Keep All, negative truncates from End)" ) - with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or hunyuan_video_custom_edit or t2v or standin or ltxv or infinitetalk or recammaster or flux and model_reference_image or qwen and model_reference_image) as video_prompt_column: + any_control_video = any_control_image = False + guide_preprocessing = model_def.get("guide_preprocessing", None) + mask_preprocessing = model_def.get("mask_preprocessing", None) + guide_custom_choices = model_def.get("guide_custom_choices", None) + image_ref_choices = model_def.get("image_ref_choices", None) + + # with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or hunyuan_video_custom_edit or t2v or standin or ltxv or infinitetalk or recammaster or (flux or qwen ) and model_reference_image and image_mode_value >=1) as video_prompt_column: + with gr.Column(visible= guide_preprocessing is not None or mask_preprocessing is not None or guide_custom_choices is not None or image_ref_choices is not None) as video_prompt_column: video_prompt_type_value= ui_defaults.get("video_prompt_type","") video_prompt_type = gr.Text(value= video_prompt_type_value, visible= False) - any_control_video = True - any_control_image = image_outputs - with gr.Row(): - if t2v: - video_prompt_type_video_guide = gr.Dropdown( - choices=[ - ("Use Text Prompt Only", ""), - ("Image to Image guided by Text Prompt" if image_outputs else "Video to Video guided by Text Prompt", "GUV"), - ], - value=filter_letters(video_prompt_type_value, "GUV"), - label="Video to Video", scale = 2, show_label= False, visible= True - ) - elif vace : + with gr.Row(visible = image_mode_value!=2) as guide_selection_row: + # Control Video Preprocessing + if guide_preprocessing is None: + video_prompt_type_video_guide = gr.Dropdown(choices=[("","")], value="", label="Control Video", scale = 2, visible= False, show_label= True, ) + else: pose_label = "Pose" if image_outputs else "Motion" - video_prompt_type_video_guide = gr.Dropdown( - choices=[ - ("No Control Image" if image_outputs else "No Control Video", ""), - ("Keep Control Image Unchanged" if image_outputs else "Keep Control Video Unchanged", "UV"), - (f"Transfer Human {pose_label}" , "PV"), - ("Transfer Depth", "DV"), - ("Transfer Shapes", "SV"), - ("Transfer Flow", "LV"), - ("Recolorize", "CV"), - ("Perform Inpainting", "MV"), - ("Use Vace raw format", "V"), - (f"Transfer Human {pose_label} & Depth", "PDV"), - (f"Transfer Human {pose_label} & Shapes", "PSV"), - (f"Transfer Human {pose_label} & Flow", "PLV"), - ("Transfer Depth & Shapes", "DSV"), - ("Transfer Depth & Flow", "DLV"), - ("Transfer Shapes & Flow", "SLV"), - ], - value=filter_letters(video_prompt_type_value, "PDSLCMGUV"), - label="Control Image Process" if image_outputs else "Control Video Process", scale = 2, visible= True, show_label= True, - ) - elif ltxv: - video_prompt_type_video_guide = gr.Dropdown( - choices=[ - ("No Control Video", ""), - ("Transfer Human Motion", "PV"), - ("Transfer Depth", "DV"), - ("Transfer Canny Edges", "EV"), - ("Use LTXV raw format", "V"), - ], - value=filter_letters(video_prompt_type_value, "PDEV"), - label="Control Video Process", scale = 2, visible= True, show_label= True, - 
) + guide_preprocessing_labels_all = { + "": "No Control Video", + "UV": "Keep Control Video Unchanged", + "PV": f"Transfer Human {pose_label}", + "PMV": f"Transfer Human {pose_label}", + "DV": "Transfer Depth", + "EV": "Transfer Canny Edges", + "SV": "Transfer Shapes", + "LV": "Transfer Flow", + "CV": "Recolorize", + "MV": "Perform Inpainting", + "V": "Use Vace raw format", + "PDV": f"Transfer Human {pose_label} & Depth", + "PSV": f"Transfer Human {pose_label} & Shapes", + "PLV": f"Transfer Human {pose_label} & Flow" , + "DSV": "Transfer Depth & Shapes", + "DLV": "Transfer Depth & Flow", + "SLV": "Transfer Shapes & Flow", + } + guide_preprocessing_choices = [] + guide_preprocessing_labels = guide_preprocessing.get("labels", {}) + for process_type in guide_preprocessing["selection"]: + process_label = guide_preprocessing_labels.get(process_type, None) + process_label = guide_preprocessing_labels_all.get(process_type,process_type) if process_label is None else process_label + if image_outputs: process_label = process_label.replace("Video", "Image") + guide_preprocessing_choices.append( (process_label, process_type) ) - elif hunyuan_video_custom_edit: + video_prompt_type_video_guide_label = guide_preprocessing.get("label", "Control Video Process") + if image_outputs: video_prompt_type_video_guide_label = video_prompt_type_video_guide_label.replace("Video", "Image") video_prompt_type_video_guide = gr.Dropdown( - choices=[ - ("Inpaint Control Image" if image_outputs else "Inpaint Control Video", "MV"), - ("Transfer Human Motion", "PMV"), - ], - value=filter_letters(video_prompt_type_value, "PDSLCMUV"), - label="Image to Image" if image_outputs else "Video to Video", scale = 3, visible= True, show_label= True, - ) - elif recammaster: - video_prompt_type_video_guide = gr.Dropdown(value="UV", choices = [("Control Video","UV")], visible=False) + guide_preprocessing_choices, + value=filter_letters(video_prompt_type_value, "PDESLCMUV", guide_preprocessing.get("default", "") ), + label= video_prompt_type_video_guide_label , scale = 2, visible= guide_preprocessing.get("visible", True) , show_label= True, + ) + any_control_video = True + any_control_image = image_outputs + + # Alternate Control Video Preprocessing / Options + if guide_custom_choices is None: + video_prompt_type_video_guide_alt = gr.Dropdown(choices=[("","")], value="", label="Control Video", visible= False, scale = 2 ) else: - any_control_video = False - any_control_image = False - video_prompt_type_video_guide = gr.Dropdown(value="", choices = [("","")], visible=False) - - if infinitetalk: + video_prompt_type_video_guide_alt_label = guide_custom_choices.get("label", "Control Video Process") + if image_outputs: video_prompt_type_video_guide_alt_label = video_prompt_type_video_guide_alt_label.replace("Video", "Image") + video_prompt_type_video_guide_alt_choices = [(label.replace("Video", "Image") if image_outputs else label, value) for label,value in guide_custom_choices["choices"] ] video_prompt_type_video_guide_alt = gr.Dropdown( - choices=[ - ("Images to Video, each Reference Image will start a new shot with a new Sliding Window - Sharp Transitions", "QKI"), - ("Images to Video, each Reference Image will start a new shot with a new Sliding Window - Smooth Transitions", "KI"), - ("Sparse Video to Video, one Image will by extracted from Video for each new Sliding Window - Sharp Transitions", "QRUV"), - ("Sparse Video to Video, one Image will by extracted from Video for each new Sliding Window - Smooth Transitions", "RUV"), - ("Video to 
Video, amount of motion transferred depends on Denoising Strength - Sharp Transitions", "GQUV"), - ("Video to Video, amount of motion transferred depends on Denoising Strength - Smooth Transitions", "GUV"), - ], - value=filter_letters(video_prompt_type_value, "RGUVQKI"), - label="Video to Video", scale = 3, visible= True, show_label= False, - ) - any_control_video = any_control_image = True - else: - video_prompt_type_video_guide_alt = gr.Dropdown(value="", choices = [("","")], visible=False) + choices= video_prompt_type_video_guide_alt_choices, + value=filter_letters(video_prompt_type_value, guide_custom_choices["letters_filter"], guide_custom_choices.get("default", "") ), + visible = guide_custom_choices.get("visible", True), + label= video_prompt_type_video_guide_alt_label, show_label= guide_custom_choices.get("show_label", True), scale = 2 + ) + any_control_video = True + any_control_image = image_outputs - # video_prompt_video_guide_trigger = gr.Text(visible=False, value="") - if t2v: - video_prompt_type_video_mask = gr.Dropdown(value = "", choices = [""], visible = False) - elif hunyuan_video_custom_edit: - video_prompt_type_video_mask = gr.Dropdown( - choices=[ - ("Masked Area", "A"), - ("Non Masked Area", "NA"), - ], - value= filter_letters(video_prompt_type_value, "NA"), - visible= "V" in video_prompt_type_value, - label="Area Processed", scale = 2, show_label= True, - ) - elif ltxv: - video_prompt_type_video_mask = gr.Dropdown( - choices=[ - ("Whole Frame", ""), - ("Masked Area", "A"), - ("Non Masked Area", "NA"), - ("Masked Area, rest Inpainted", "XA"), - ("Non Masked Area, rest Inpainted", "XNA"), - ], - value= filter_letters(video_prompt_type_value, "XNA"), - visible= "V" in video_prompt_type_value and not "U" in video_prompt_type_value, - label="Area Processed", scale = 2, show_label= True, - ) + # Control Mask Preprocessing + if mask_preprocessing is None: + video_prompt_type_video_mask = gr.Dropdown(choices=[("","")], value="", label="Video Mask", scale = 2, visible= False, show_label= True, ) else: + mask_preprocessing_labels_all = { + "": "Whole Frame", + "A": "Masked Area", + "NA": "Non Masked Area", + "XA": "Masked Area, rest Inpainted", + "XNA": "Non Masked Area, rest Inpainted", + "YA": "Masked Area, rest Depth", + "YNA": "Non Masked Area, rest Depth", + "WA": "Masked Area, rest Shapes", + "WNA": "Non Masked Area, rest Shapes", + "ZA": "Masked Area, rest Flow", + "ZNA": "Non Masked Area, rest Flow" + } + + mask_preprocessing_choices = [] + mask_preprocessing_labels = guide_preprocessing.get("labels", {}) + for process_type in mask_preprocessing["selection"]: + process_label = mask_preprocessing_labels.get(process_type, None) + process_label = mask_preprocessing_labels_all.get(process_type, process_type) if process_label is None else process_label + mask_preprocessing_choices.append( (process_label, process_type) ) + + video_prompt_type_video_mask_label = guide_preprocessing.get("label", "Area Processed") video_prompt_type_video_mask = gr.Dropdown( - choices=[ - ("Whole Frame", ""), - ("Masked Area", "A"), - ("Non Masked Area", "NA"), - ("Masked Area, rest Inpainted", "XA"), - ("Non Masked Area, rest Inpainted", "XNA"), - ("Masked Area, rest Depth", "YA"), - ("Non Masked Area, rest Depth", "YNA"), - ("Masked Area, rest Shapes", "WA"), - ("Non Masked Area, rest Shapes", "WNA"), - ("Masked Area, rest Flow", "ZA"), - ("Non Masked Area, rest Flow", "ZNA"), - ], - value= filter_letters(video_prompt_type_value, "XYZWNA"), - visible= "V" in video_prompt_type_value and not "U" 
in video_prompt_type_value and not hunyuan_video_custom and not ltxv, - label="Area Processed", scale = 2, show_label= True, - ) - image_ref_choices = model_def.get("image_ref_choices", None) - if image_ref_choices is not None: - video_prompt_type_image_refs = gr.Dropdown( - choices= image_ref_choices["choices"], - value=filter_letters(video_prompt_type_value, image_ref_choices["letters_filter"]), - visible = True, - label=image_ref_choices["label"], show_label= True, scale = 2 - ) - elif t2v: - video_prompt_type_image_refs = gr.Dropdown(value="", label="Ref Image", choices=[""], visible =False) - elif vace: - video_prompt_type_image_refs = gr.Dropdown( - choices=[ - ("None", ""), - ("Inject only People / Objects", "I"), - ("Inject Landscape and then People / Objects", "KI"), - ("Inject Frames and then People / Objects", "FI"), - ], - value=filter_letters(video_prompt_type_value, "KFI"), - visible = True, - label="Reference Images", show_label= True, scale = 2 - ) - elif standin: # and not vace - video_prompt_type_image_refs = gr.Dropdown( - choices=[ - ("No Reference Image", ""), - ("Reference Image is a Person Face", "I"), - ], - value=filter_letters(video_prompt_type_value, "I"), - visible = True, - show_label=False, - label="Reference Image", scale = 2 + mask_preprocessing_choices, + value=filter_letters(video_prompt_type_value, "XYZWNA", mask_preprocessing.get("default", "")), + label= video_prompt_type_video_mask_label , scale = 2, visible= "V" in video_prompt_type_value and not "U" in video_prompt_type_value and mask_preprocessing.get("visible", True), + show_label= True, ) - elif (flux or qwen) and model_reference_image: + + # Image Refs Selection + if image_ref_choices is None: video_prompt_type_image_refs = gr.Dropdown( - choices=[ - ("None", ""), - ("Conditional Images is first Main Subject / Landscape and may be followed by People / Objects", "KI"), - ("Conditional Images are People / Objects", "I"), - ], - value=filter_letters(video_prompt_type_value, "KI"), - visible = True, - show_label=False, - label="Reference Images Combination Method", scale = 2 - ) - else: - video_prompt_type_image_refs = gr.Dropdown( - choices=[ ("None", ""),("Start", "KI"),("Ref Image", "I")], - value=filter_letters(video_prompt_type_value, "KI"), + # choices=[ ("None", ""),("Start", "KI"),("Ref Image", "I")], + choices=[ ("None", ""),], + value=filter_letters(video_prompt_type_value, ""), visible = False, label="Start / Reference Images", scale = 2 ) - image_guide = gr.Image(label= "Control Image", height = gallery_height, type ="pil", visible= image_outputs and "V" in video_prompt_type_value, value= ui_defaults.get("image_guide", None)) - video_guide = gr.Video(label= "Control Video", height = gallery_height, visible= (not image_outputs) and "V" in video_prompt_type_value, value= ui_defaults.get("video_guide", None)) + any_reference_image = False + else: + any_reference_image = True + video_prompt_type_image_refs = gr.Dropdown( + choices= image_ref_choices["choices"], + value=filter_letters(video_prompt_type_value, image_ref_choices["letters_filter"]), + visible = image_ref_choices.get("visible", True), + label=image_ref_choices.get("label", "Ref. 
Images Type"), show_label= True, scale = 2 + ) - denoising_strength = gr.Slider(0, 1, value= ui_defaults.get("denoising_strength" ,0.5), step=0.01, label="Denoising Strength (the Lower the Closer to the Control Video)", visible = "G" in video_prompt_type_value, show_reset_button= False) + image_guide = gr.Image(label= "Control Image", height = gallery_height, type ="pil", visible= image_mode_value==1 and "V" in video_prompt_type_value, value= ui_defaults.get("image_guide", None)) + video_guide = gr.Video(label= "Control Video", height = gallery_height, visible= (not image_outputs) and "V" in video_prompt_type_value, value= ui_defaults.get("video_guide", None)) + if image_mode_value == 2 and inpaint_support: + image_guide_value = ui_defaults.get("image_guide", None) + image_mask_value = ui_defaults.get("image_mask", None) + if image_guide_value is None: + image_mask_guide_value = None + else: + def rgb_bw_to_rgba_mask(img, thresh=127): + a = img.convert('L').point(lambda p: 255 if p > thresh else 0) # alpha + out = Image.new('RGBA', img.size, (255, 255, 255, 0)) # white, transparent + out.putalpha(a) # white where alpha=255 + return out + + image_mask_value = rgb_bw_to_rgba_mask(image_mask_value) + image_mask_guide_value = { "background" : image_guide_value, "composite" : None, "layers": [image_mask_value] } + + image_mask_guide = gr.ImageEditor( + label="Control Image to be Inpainted", + value = image_mask_guide_value, + type='pil', + sources=["upload", "webcam"], + image_mode='RGB', + layers=False, + brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"), + # fixed_canvas= True, + width=800, + height=800, + # transforms=None, + # interactive=True, + elem_id="img_editor", + visible= True + ) + any_control_image = True + else: + image_mask_guide = gr.ImageEditor(value = None, visible = False, elem_id="img_editor") + + + denoising_strength = gr.Slider(0, 1, value= ui_defaults.get("denoising_strength" ,0.5), step=0.01, label=f"Denoising Strength (the Lower the Closer to the Control {'Image' if image_outputs else 'Video'})", visible = "G" in video_prompt_type_value, show_reset_button= False) keep_frames_video_guide_visible = not image_outputs and "V" in video_prompt_type_value and not model_def.get("keep_frames_video_guide_not_supported", False) keep_frames_video_guide = gr.Text(value=ui_defaults.get("keep_frames_video_guide","") , visible= keep_frames_video_guide_visible , scale = 2, label= "Frames to keep in Control Video (empty=All, 1=first, a:b for a range, space to separate values)" ) #, -1=last @@ -7325,11 +7432,10 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non video_guide_outpainting_left = gr.Slider(0, 100, value= video_guide_outpainting_list[2], step=5, label="Left %", show_reset_button= False) video_guide_outpainting_right = gr.Slider(0, 100, value= video_guide_outpainting_list[3], step=5, label="Right %", show_reset_button= False) any_image_mask = image_outputs and vace - image_mask = gr.Image(label= "Image Mask Area (for Inpainting, white = Control Area, black = Unchanged)", type ="pil", visible= image_outputs and "V" in video_prompt_type_value and "A" in video_prompt_type_value and not "U" in video_prompt_type_value , height = gallery_height, value= ui_defaults.get("image_mask", None)) + image_mask = gr.Image(label= "Image Mask Area (for Inpainting, white = Control Area, black = Unchanged)", type ="pil", visible= image_mode_value==1 and "V" in video_prompt_type_value and "A" in video_prompt_type_value and not "U" in video_prompt_type_value , 
height = gallery_height, value= ui_defaults.get("image_mask", None)) video_mask = gr.Video(label= "Video Mask Area (for Inpainting, white = Control Area, black = Unchanged)", visible= (not image_outputs) and "V" in video_prompt_type_value and "A" in video_prompt_type_value and not "U" in video_prompt_type_value , height = gallery_height, value= ui_defaults.get("video_mask", None)) mask_expand = gr.Slider(-10, 50, value=ui_defaults.get("mask_expand", 0), step=1, label="Expand / Shrink Mask Area", visible= "V" in video_prompt_type_value and "A" in video_prompt_type_value and not "U" in video_prompt_type_value ) - any_reference_image = vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or infinitetalk or (flux or qwen) and model_reference_image image_refs_single_image_mode = model_def.get("one_image_ref_needed", False) image_refs_label = "Start Image" if hunyuan_video_avatar else ("Reference Image" if image_refs_single_image_mode else "Reference Images") + (" (each Image will start a new Clip)" if infinitetalk else "") @@ -7424,10 +7530,13 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non visible= True, show_label= not on_demand_prompt_enhancer, ) with gr.Row(): - if server_config.get("fit_canvas", 0) == 1: - label = "Max Resolution (As it maybe less depending on video width / height ratio)" + fit_canvas = server_config.get("fit_canvas", 0) + if fit_canvas == 1: + label = "Outer Box Resolution (one dimension may be less to preserve video W/H ratio)" + elif fit_canvas == 2: + label = "Output Resolution (Input Images wil be Cropped if the W/H ratio is different)" else: - label = "Max Resolution (Pixels will be reallocated depending on the output width / height ratio)" + label = "Resolution Budget (Pixels will be reallocated to preserve Inputs W/H ratio)" current_resolution_choice = ui_defaults.get("resolution","832x480") if update_form or last_resolution is None else last_resolution model_resolutions = model_def.get("resolutions", None) resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions) @@ -7751,7 +7860,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non sliding_window_defaults = model_def.get("sliding_window_defaults", {}) sliding_window_size = gr.Slider(5, get_max_frames(257), value=ui_defaults.get("sliding_window_size", 129), step=4, label="Sliding Window Size") sliding_window_overlap = gr.Slider(sliding_window_defaults.get("overlap_min", 1), sliding_window_defaults.get("overlap_max", 97), value=ui_defaults.get("sliding_window_overlap",sliding_window_defaults.get("overlap_default", 5)), step=sliding_window_defaults.get("overlap_step", 4), label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)") - sliding_window_color_correction_strength = gr.Slider(0, 1, value=ui_defaults.get("sliding_window_color_correction_strength",1), step=0.01, label="Color Correction Strength (match colors of new window with previous one, 0 = disabled)") + sliding_window_color_correction_strength = gr.Slider(0, 1, value=ui_defaults.get("sliding_window_color_correction_strength",1), step=0.01, label="Color Correction Strength (match colors of new window with previous one, 0 = disabled)", visible = True) sliding_window_overlap_noise = gr.Slider(0, 150, value=ui_defaults.get("sliding_window_overlap_noise",20 if vace else 0), step=1, label="Noise to be added to overlapped frames to reduce blur effect" , visible = 
vace) sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, label="Discard Last Frames of a Window (that may have bad quality)", visible = True) @@ -7902,7 +8011,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non add_to_queue_trigger = gr.Text(visible = False) with gr.Column(visible= False) as current_gen_column: - with gr.Accordion("Preview", open=False) as queue_accordion: + with gr.Accordion("Preview", open=False): preview = gr.Image(label="Preview", height=200, show_label= False) preview_trigger = gr.Text(visible= False) gen_info = gr.HTML(visible=False, min_height=1) @@ -7947,8 +8056,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non video_guide_outpainting_checkbox, video_guide_outpainting_row, show_advanced, video_info_to_control_video_btn, video_info_to_video_source_btn, sample_solver_row, video_buttons_row, image_buttons_row, video_postprocessing_tab, audio_remuxing_tab, PP_MMAudio_row, PP_custom_audio_row, video_info_to_start_image_btn, video_info_to_end_image_btn, video_info_to_reference_image_btn, video_info_to_image_guide_btn, video_info_to_image_mask_btn, - NAG_col, speakers_locations_row, embedded_guidance_row, guidance_phases_row, guidance_row, resolution_group, cfg_free_guidance_col, control_net_weights_row, image_mode_tabs, - min_frames_if_references_col, video_prompt_type_alignment, prompt_enhancer_btn] + image_start_extra + image_end_extra + image_refs_extra # presets_column, + NAG_col, speakers_locations_row, embedded_guidance_row, guidance_phases_row, guidance_row, resolution_group, cfg_free_guidance_col, control_net_weights_row, guide_selection_row, image_mode_tabs, + min_frames_if_references_col, video_prompt_type_alignment, prompt_enhancer_btn, tab_inpaint, tab_t2v] + image_start_extra + image_end_extra + image_refs_extra # presets_column, if update_form: locals_dict = locals() gen_inputs = [state_dict if k=="state" else locals_dict[k] for k in inputs_names] + [state_dict] + extra_inputs @@ -7970,10 +8079,10 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non image_prompt_type_radio.change(fn=refresh_image_prompt_type_radio, inputs=[state, image_prompt_type, image_prompt_type_radio], outputs=[image_prompt_type, image_start_row, image_end_row, video_source, keep_frames_video_source, image_prompt_type_endcheckbox], show_progress="hidden" ) image_prompt_type_endcheckbox.change(fn=refresh_image_prompt_type_endcheckbox, inputs=[state, image_prompt_type, image_prompt_type_radio, image_prompt_type_endcheckbox], outputs=[image_prompt_type, image_end_row] ) # video_prompt_video_guide_trigger.change(fn=refresh_video_prompt_video_guide_trigger, inputs=[state, video_prompt_type, video_prompt_video_guide_trigger], outputs=[video_prompt_type, video_prompt_type_video_guide, video_guide, keep_frames_video_guide, denoising_strength, video_guide_outpainting_col, video_prompt_type_video_mask, video_mask, mask_expand]) - video_prompt_type_image_refs.input(fn=refresh_video_prompt_type_image_refs, inputs = [state, video_prompt_type, video_prompt_type_image_refs], outputs = [video_prompt_type, image_refs_row, remove_background_images_ref, image_refs_relative_size, frames_positions,video_guide_outpainting_col]) - video_prompt_type_video_guide.input(fn=refresh_video_prompt_type_video_guide, inputs = [state, video_prompt_type, video_prompt_type_video_guide, image_mode], outputs = [video_prompt_type, video_guide, 
image_guide, keep_frames_video_guide, denoising_strength, video_guide_outpainting_col, video_prompt_type_video_mask, video_mask, image_mask, mask_expand]) - video_prompt_type_video_guide_alt.input(fn=refresh_video_prompt_type_video_guide_alt, inputs = [state, video_prompt_type, video_prompt_type_video_guide_alt], outputs = [video_prompt_type, video_guide, image_refs_row, denoising_strength ]) - video_prompt_type_video_mask.input(fn=refresh_video_prompt_type_video_mask, inputs = [state, video_prompt_type, video_prompt_type_video_mask, image_mode], outputs = [video_prompt_type, video_mask, image_mask, mask_expand]) + video_prompt_type_image_refs.input(fn=refresh_video_prompt_type_image_refs, inputs = [state, video_prompt_type, video_prompt_type_image_refs], outputs = [video_prompt_type, image_refs_row, remove_background_images_ref, image_refs_relative_size, frames_positions,video_guide_outpainting_col], show_progress="hidden") + video_prompt_type_video_guide.input(fn=refresh_video_prompt_type_video_guide, inputs = [state, video_prompt_type, video_prompt_type_video_guide, image_mode], outputs = [video_prompt_type, video_guide, image_guide, keep_frames_video_guide, denoising_strength, video_guide_outpainting_col, video_prompt_type_video_mask, video_mask, image_mask, mask_expand], show_progress="hidden") + video_prompt_type_video_guide_alt.input(fn=refresh_video_prompt_type_video_guide_alt, inputs = [state, video_prompt_type, video_prompt_type_video_guide_alt], outputs = [video_prompt_type, video_guide, image_refs_row, denoising_strength ], show_progress="hidden") + video_prompt_type_video_mask.input(fn=refresh_video_prompt_type_video_mask, inputs = [state, video_prompt_type, video_prompt_type_video_mask, image_mode], outputs = [video_prompt_type, video_mask, image_mask, mask_expand], show_progress="hidden") video_prompt_type_alignment.input(fn=refresh_video_prompt_type_alignment, inputs = [state, video_prompt_type, video_prompt_type_alignment], outputs = [video_prompt_type]) multi_prompts_gen_type.select(fn=refresh_prompt_labels, inputs=[multi_prompts_gen_type, image_mode], outputs=[prompt, wizard_prompt, image_end], show_progress="hidden") video_guide_outpainting_top.input(fn=update_video_guide_outpainting, inputs=[video_guide_outpainting, video_guide_outpainting_top, gr.State(0)], outputs = [video_guide_outpainting], trigger_mode="multiple" ) @@ -7984,8 +8093,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non show_advanced.change(fn=switch_advanced, inputs=[state, show_advanced, lset_name], outputs=[advanced_row, preset_buttons_rows, refresh_lora_btn, refresh2_row ,lset_name]).then( fn=switch_prompt_type, inputs = [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars], outputs = [wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, prompt_column_advanced, prompt_column_wizard, prompt_column_wizard_vars, *prompt_vars]) queue_df.select( fn=handle_celll_selection, inputs=state, outputs=[queue_df, modal_image_display, modal_container]) - gr.on( triggers=[output.change, output.select], fn=select_video, inputs=[state, output], outputs=[last_choice, video_info, video_buttons_row, image_buttons_row, video_postprocessing_tab, audio_remuxing_tab]) - preview_trigger.change(refresh_preview, inputs= [state], outputs= [preview]) + gr.on( triggers=[output.change, output.select], fn=select_video, inputs=[state, output], outputs=[last_choice, video_info, video_buttons_row, image_buttons_row, 
video_postprocessing_tab, audio_remuxing_tab], show_progress="hidden") + preview_trigger.change(refresh_preview, inputs= [state], outputs= [preview], show_progress="hidden") PP_MMAudio_setting.change(fn = lambda value : [gr.update(visible = value == 1), gr.update(visible = value == 0)] , inputs = [PP_MMAudio_setting], outputs = [PP_MMAudio_row, PP_custom_audio_row] ) def refresh_status_async(state, progress=gr.Progress()): gen = get_gen_info(state) @@ -8017,7 +8126,9 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non output_trigger.change(refresh_gallery, inputs = [state], - outputs = [output, gen_info, generate_btn, add_to_queue_btn, current_gen_column, current_gen_buttons_row, queue_df, abort_btn, onemorewindow_btn]) + outputs = [output, gen_info, generate_btn, add_to_queue_btn, current_gen_column, current_gen_buttons_row, queue_df, abort_btn, onemorewindow_btn], + show_progress="hidden" + ) preview_column_no.input(show_preview_column_modal, inputs=[state, preview_column_no], outputs=[preview_column_no, modal_image_display, modal_container]) @@ -8033,7 +8144,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non gr.on( triggers=[video_info_extract_settings_btn.click, video_info_extract_image_settings_btn.click], fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8042,7 +8154,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non prompt_enhancer_btn.click(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8050,7 +8163,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non saveform_trigger.change(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8065,14 +8179,14 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non video_info_to_video_source_btn.click(fn=video_to_source_video, inputs =[state, output, last_choice], outputs = [video_source] ) video_info_to_start_image_btn.click(fn=image_to_ref_image_add, inputs =[state, output, last_choice, image_start, gr.State("Start Image")], outputs = [image_start] ) video_info_to_end_image_btn.click(fn=image_to_ref_image_add, inputs =[state, output, last_choice, image_end, gr.State("End Image")], outputs = [image_end] ) - video_info_to_image_guide_btn.click(fn=image_to_ref_image_set, inputs =[state, output, last_choice, image_guide, gr.State("Control Image")], outputs = [image_guide] ) + video_info_to_image_guide_btn.click(fn=image_to_ref_image_guide, inputs =[state, output, last_choice], outputs = [image_guide, image_mask_guide] ) video_info_to_image_mask_btn.click(fn=image_to_ref_image_set, inputs =[state, output, last_choice, image_mask, gr.State("Image Mask")], outputs = [image_mask] ) video_info_to_reference_image_btn.click(fn=image_to_ref_image_add, inputs =[state, output, last_choice, image_refs, gr.State("Ref Image")], 
outputs = [image_refs] ) video_info_postprocessing_btn.click(fn=apply_post_processing, inputs =[state, output, last_choice, PP_temporal_upsampling, PP_spatial_upsampling, PP_film_grain_intensity, PP_film_grain_saturation], outputs = [mode, generate_trigger, add_to_queue_trigger ] ) video_info_remux_audio_btn.click(fn=remux_audio, inputs =[state, output, last_choice, PP_MMAudio_setting, PP_MMAudio_prompt, PP_MMAudio_neg_prompt, PP_MMAudio_seed, PP_repeat_generation, PP_custom_audio], outputs = [mode, generate_trigger, add_to_queue_trigger ] ) save_lset_btn.click(validate_save_lset, inputs=[state, lset_name], outputs=[apply_lset_btn, refresh_lora_btn, delete_lset_btn, save_lset_btn,confirm_save_lset_btn, cancel_lset_btn, save_lset_prompt_drop]) delete_lset_btn.click(validate_delete_lset, inputs=[state, lset_name], outputs=[apply_lset_btn, refresh_lora_btn, delete_lset_btn, save_lset_btn,confirm_delete_lset_btn, cancel_lset_btn ]) - confirm_save_lset_btn.click(fn=validate_wizard_prompt, inputs =[state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , outputs= [prompt]).then( + confirm_save_lset_btn.click(fn=validate_wizard_prompt, inputs =[state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , outputs= [prompt], show_progress="hidden",).then( fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None).then( @@ -8087,7 +8201,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non lset_name.select(fn=update_lset_type, inputs=[state, lset_name], outputs=save_lset_prompt_drop) export_settings_from_file_btn.click(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8104,7 +8219,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non image_mode_tabs.select(fn=record_image_mode_tab, inputs=[state], outputs= None ).then(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8112,7 +8228,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non settings_file.upload(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8126,17 +8243,20 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non refresh_form_trigger.change(fn= fill_inputs, inputs=[state], - outputs=gen_inputs + extra_inputs + outputs=gen_inputs + extra_inputs, + show_progress= "full" if args.debug_gen_form else "hidden", ).then(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars], - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ) model_family.input(fn=change_model_family, inputs=[state, model_family], outputs= [model_choice]) model_choice.change(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - 
outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8145,7 +8265,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non outputs= [header] ).then(fn= fill_inputs, inputs=[state], - outputs=gen_inputs + extra_inputs + outputs=gen_inputs + extra_inputs, + show_progress="full" if args.debug_gen_form else "hidden", ).then(fn= preload_model_when_switching, inputs=[state], outputs=[gen_status]) @@ -8154,13 +8275,15 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non generate_trigger.change(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None ).then(fn=process_prompt_and_add_tasks, inputs = [state, model_choice], - outputs= queue_df + outputs= queue_df, + show_progress="hidden", ).then(fn=prepare_generate_video, inputs= [state], outputs= [generate_btn, add_to_queue_btn, current_gen_column, current_gen_buttons_row] @@ -8170,10 +8293,12 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non ).then( fn=lambda s: gr.Accordion(open=True) if len(get_gen_info(s).get("queue", [])) > 1 else gr.update(), inputs=[state], - outputs=[queue_accordion] + outputs=[queue_accordion], + show_progress="hidden", ).then(fn=process_tasks, inputs= [state], outputs= [preview_trigger, output_trigger], + show_progress="hidden", ).then(finalize_generation, inputs= [state], outputs= [output, abort_btn, generate_btn, add_to_queue_btn, current_gen_column, gen_info] @@ -8280,17 +8405,20 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non # gr.on(triggers=[add_to_queue_btn.click, add_to_queue_trigger.change],fn=validate_wizard_prompt, add_to_queue_trigger.change(fn=validate_wizard_prompt, inputs =[state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None ).then(fn=process_prompt_and_add_tasks, inputs = [state, model_choice], - outputs=queue_df + outputs=queue_df, + show_progress="hidden", ).then( fn=lambda s: gr.Accordion(open=True) if len(get_gen_info(s).get("queue", [])) > 1 else gr.update(), inputs=[state], - outputs=[queue_accordion] + outputs=[queue_accordion], + show_progress="hidden", ).then( fn=update_status, inputs = [state], @@ -8302,8 +8430,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non outputs=[modal_container] ) - return ( state, loras_choices, lset_name, resolution, - video_guide, image_guide, video_mask, image_mask, image_refs, refresh_form_trigger + return ( state, loras_choices, lset_name, resolution, refresh_form_trigger, + # video_guide, image_guide, video_mask, image_mask, image_refs, ) @@ -8339,8 +8467,9 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice fit_canvas_choice = gr.Dropdown( choices=[ - ("Dimensions correspond to the Pixels Budget (as the Prompt Image/Video will be resized to match this pixels budget, output video height or width may exceed the requested dimensions )", 0), - ("Dimensions correspond to the Maximum Width and Height (as the Prompt Image/Video will be resized to fit into these dimensions, the output video may 
be smaller)", 1), + ("Dimensions correspond to the Pixels Budget (as the Prompt Image/Video will be Resized to match this pixels Budget, output video height or width may exceed the requested dimensions )", 0), + ("Dimensions correspond to the Maximum Width and Height (as the Prompt Image/Video will be Resized to fit into these dimensions, the output video may be smaller)", 1), + ("Dimensions correspond to the Output Width and Height (as the Prompt Image/Video will be Cropped to fit exactly these dimensions)", 2), ], value= server_config.get("fit_canvas", 0), label="Generated Video Dimensions when Prompt contains an Image or a Video", @@ -9231,9 +9360,17 @@ def create_ui(): console.log('Events dispatched for column:', index); } }; - console.log('sendColIndex function attached to window'); - } + + // cancel wheel usage inside image editor + const hit = n => n?.id === "img_editor" || n?.classList?.contains("wheel-pass"); + addEventListener("wheel", e => { + const path = e.composedPath?.() || (() => { let a=[],n=e.target; for(;n;n=n.parentNode||n.host) a.push(n); return a; })(); + if (path.some(hit)) e.stopImmediatePropagation(); + }, { capture: true, passive: true }); + + } + """ if server_config.get("display_stats", 0) == 1: from shared.utils.stats import SystemStatsApp @@ -9264,13 +9401,13 @@ def create_ui(): stats_element = stats_app.get_gradio_element() with gr.Row(): - ( state, loras_choices, lset_name, resolution, - video_guide, image_guide, video_mask, image_mask, image_refs, refresh_form_trigger + ( state, loras_choices, lset_name, resolution, refresh_form_trigger + # video_guide, image_guide, video_mask, image_mask, image_refs, ) = generate_video_tab(model_family=model_family, model_choice=model_choice, header=header, main = main, main_tabs =main_tabs) with gr.Tab("Guides", id="info") as info_tab: generate_info_tab() with gr.Tab("Video Mask Creator", id="video_mask_creator") as video_mask_creator: - matanyone_app.display(main_tabs, tab_state, server_config, video_guide, image_guide, video_mask, image_mask, image_refs) + matanyone_app.display(main_tabs, tab_state, state, refresh_form_trigger, server_config, get_current_model_settings) #, video_guide, image_guide, video_mask, image_mask, image_refs) if not args.lock_config: with gr.Tab("Downloads", id="downloads") as downloads_tab: generate_download_tab(lset_name, loras_choices, state) From e7c08d12c846ddf4817013e1b112b1ecf183ada3 Mon Sep 17 00:00:00 2001 From: DeepBeepMeep Date: Wed, 10 Sep 2025 01:47:55 +0200 Subject: [PATCH 2/3] fixed unwanted discontinuity with at the end of first sliding window with InfiniteTalk --- models/wan/any2video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/wan/any2video.py b/models/wan/any2video.py index bb91dc6..1e62c35 100644 --- a/models/wan/any2video.py +++ b/models/wan/any2video.py @@ -539,7 +539,7 @@ class WanAny2V: new_shot = "Q" in video_prompt_type else: if pre_video_frame is None: - new_shot = True + new_shot = "Q" in video_prompt_type else: if input_ref_images is None: input_ref_images, new_shot = [pre_video_frame], False From 9fa267087b2dfdba651fd173325537f031edf91d Mon Sep 17 00:00:00 2001 From: DeepBeepMeep Date: Thu, 11 Sep 2025 21:23:05 +0200 Subject: [PATCH 3/3] Flux Festival --- README.md | 9 +++- defaults/flux_dev_umo.json | 24 ++++++++++ defaults/flux_dev_uso.json | 2 +- defaults/flux_srpo.json | 15 ++++++ defaults/flux_srpo_uso.json | 17 +++++++ models/flux/flux_handler.py | 22 +++++++++ models/flux/flux_main.py | 77 +++++++++++++++++++++++++------ 
models/flux/model.py | 15 ++++++ models/flux/sampling.py | 57 +++++++++++++++++++++-- models/flux/util.py | 32 +++++++++++++ models/qwen/pipeline_qwenimage.py | 19 +++++--- models/qwen/qwen_handler.py | 1 + models/wan/any2video.py | 2 +- models/wan/wan_handler.py | 1 + wgp.py | 62 ++++++++++++++++--------- 15 files changed, 305 insertions(+), 50 deletions(-) create mode 100644 defaults/flux_dev_umo.json create mode 100644 defaults/flux_srpo.json create mode 100644 defaults/flux_srpo_uso.json diff --git a/README.md b/README.md index fc3d76c..d33b6dc 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep ## 🔥 Latest Updates : -### September 5 2025: WanGP v8.5 - Wanna be a Cropper or a Painter ? +### September 11 2025: WanGP v8.5/8.55 - Wanna be a Cropper or a Painter ? I have done some intensive internal refactoring of the generation pipeline to ease support of existing models or add new models. Nothing really visible but this makes WanGP is little more future proof. @@ -38,6 +38,13 @@ Doing more sophisticated thing Vace Image Editor works very well too: try Image For the best quality I recommend to set in *Quality Tab* the option: "*Generate a 9 Frames Long video...*" +**Update 8.55**: Flux Festival +- **Inpainting Mode** has also been added for *Flux Kontext* +- **Flux SRPO**: a new finetune with 3x better quality than Flux Dev, according to its authors. I have also created a *Flux SRPO USO* finetune, which is certainly the best open source *Style Transfer* tool available +- **Flux UMO**: a model specialized in combining multiple reference objects / people. It works quite well at 768x768 + +Good luck finding your way through all the Flux model names! + ### September 5 2025: WanGP v8.4 - Take me to Outer Space You have probably seen these short AI generated movies created using *Nano Banana* and the *First Frame - Last Frame* feature of *Kling 2.0*. The idea is to generate an image, modify a part of it with Nano Banana and give the these two images to Kling that will generate the Video between these two images, use now the previous Last Frame as the new First Frame, rinse and repeat and you get a full movie. diff --git a/defaults/flux_dev_umo.json b/defaults/flux_dev_umo.json new file mode 100644 index 0000000..57164bb --- /dev/null +++ b/defaults/flux_dev_umo.json @@ -0,0 +1,24 @@ +{ + "model": { + "name": "Flux 1 Dev UMO 12B", + "architecture": "flux", + "description": "FLUX.1 Dev UMO is a model that can Edit Images with a specialization in combining multiple image references (resized internally at 512x512 max) to produce an Image output. 
Best Image preservation at 768x768 Resolution Output.", + "URLs": "flux", + "flux-model": "flux-dev-umo", + "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-UMO_dit_lora_bf16.safetensors"], + "resolutions": [ ["1024x1024 (1:1)", "1024x1024"], + ["768x1024 (3:4)", "768x1024"], + ["1024x768 (4:3)", "1024x768"], + ["512x1024 (1:2)", "512x1024"], + ["1024x512 (2:1)", "1024x512"], + ["768x768 (1:1)", "768x768"], + ["768x512 (3:2)", "768x512"], + ["512x768 (2:3)", "512x768"]] + }, + "prompt": "the man is wearing a hat", + "embedded_guidance_scale": 4, + "resolution": "768x768", + "batch_size": 1 +} + + \ No newline at end of file diff --git a/defaults/flux_dev_uso.json b/defaults/flux_dev_uso.json index 0cd7b82..806dd7e 100644 --- a/defaults/flux_dev_uso.json +++ b/defaults/flux_dev_uso.json @@ -2,7 +2,7 @@ "model": { "name": "Flux 1 Dev USO 12B", "architecture": "flux", - "description": "FLUX.1 Dev USO is a model specialized to Edit Images with a specialization in Style Transfers (up to two).", + "description": "FLUX.1 Dev USO is a model that can Edit Images with a specialization in Style Transfers (up to two).", "modules": [ ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_projector_bf16.safetensors"]], "URLs": "flux", "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_dit_lora_bf16.safetensors"], diff --git a/defaults/flux_srpo.json b/defaults/flux_srpo.json new file mode 100644 index 0000000..59f07c6 --- /dev/null +++ b/defaults/flux_srpo.json @@ -0,0 +1,15 @@ +{ + "model": { + "name": "Flux 1 SRPO Dev 12B", + "architecture": "flux", + "description": "By fine-tuning the FLUX.1.dev model with optimized denoising and online reward adjustment, SRPO improves its human-evaluated realism and aesthetic quality by over 3x.", + "URLs": [ + "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_bf16.safetensors", + "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_quanto_bf16_int8.safetensors" + ], + "flux-model": "flux-dev" + }, + "prompt": "draw a hat", + "resolution": "1024x1024", + "batch_size": 1 +} \ No newline at end of file diff --git a/defaults/flux_srpo_uso.json b/defaults/flux_srpo_uso.json new file mode 100644 index 0000000..ddfe50d --- /dev/null +++ b/defaults/flux_srpo_uso.json @@ -0,0 +1,17 @@ +{ + "model": { + "name": "Flux 1 SRPO USO 12B", + "architecture": "flux", + "description": "FLUX.1 SRPO USO is a model that can Edit Images with a specialization in Style Transfers (up to two). 
It leverages the improved Image quality brought by the SRPO process", + "modules": [ "flux_dev_uso"], + "URLs": "flux_srpo", + "loras": "flux_dev_uso", + "flux-model": "flux-dev-uso" + }, + "prompt": "the man is wearing a hat", + "embedded_guidance_scale": 4, + "resolution": "1024x1024", + "batch_size": 1 +} + + \ No newline at end of file diff --git a/models/flux/flux_handler.py b/models/flux/flux_handler.py index c468d5a..808369f 100644 --- a/models/flux/flux_handler.py +++ b/models/flux/flux_handler.py @@ -13,6 +13,7 @@ class family_handler(): flux_schnell = flux_model == "flux-schnell" flux_chroma = flux_model == "flux-chroma" flux_uso = flux_model == "flux-dev-uso" + flux_umo = flux_model == "flux-dev-umo" flux_kontext = flux_model == "flux-dev-kontext" extra_model_def = { @@ -35,6 +36,7 @@ class family_handler(): } if flux_kontext: + extra_model_def["inpaint_support"] = True extra_model_def["image_ref_choices"] = { "choices": [ ("None", ""), @@ -43,6 +45,15 @@ class family_handler(): ], "letters_filter": "KI", } + extra_model_def["background_removal_label"]= "Remove Backgrounds only behind People / Objects except main Subject / Landscape" + elif flux_umo: + extra_model_def["image_ref_choices"] = { + "choices": [ + ("Conditional Images are People / Objects", "I"), + ], + "letters_filter": "I", + "visible": False + } extra_model_def["lock_image_refs_ratios"] = True @@ -131,10 +142,14 @@ class family_handler(): video_prompt_type = video_prompt_type.replace("I", "KI") ui_defaults["video_prompt_type"] = video_prompt_type + if settings_version < 2.34: + ui_defaults["denoising_strength"] = 1. + @staticmethod def update_default_settings(base_model_type, model_def, ui_defaults): flux_model = model_def.get("flux-model", "flux-dev") flux_uso = flux_model == "flux-dev-uso" + flux_umo = flux_model == "flux-dev-umo" flux_kontext = flux_model == "flux-dev-kontext" ui_defaults.update({ "embedded_guidance": 2.5, @@ -143,5 +158,12 @@ class family_handler(): if flux_kontext or flux_uso: ui_defaults.update({ "video_prompt_type": "KI", + "denoising_strength": 1., }) + elif flux_umo: + ui_defaults.update({ + "video_prompt_type": "I", + "remove_background_images_ref": 0, + }) + diff --git a/models/flux/flux_main.py b/models/flux/flux_main.py index 4d7c67d..6863711 100644 --- a/models/flux/flux_main.py +++ b/models/flux/flux_main.py @@ -23,6 +23,35 @@ from .util import ( ) from PIL import Image +def preprocess_ref(raw_image: Image.Image, long_size: int = 512): + # get the width and height of the original image + image_w, image_h = raw_image.size + + # work out the long and short sides + if image_w >= image_h: + new_w = long_size + new_h = int((long_size / image_w) * image_h) + else: + new_h = long_size + new_w = int((long_size / image_h) * image_w) + + # resize proportionally to the new width and height + raw_image = raw_image.resize((new_w, new_h), resample=Image.LANCZOS) + target_w = new_w // 16 * 16 + target_h = new_h // 16 * 16 + + # compute the crop start coordinates for a center crop + left = (new_w - target_w) // 2 + top = (new_h - target_h) // 2 + right = left + target_w + bottom = top + target_h + + # apply the center crop + raw_image = raw_image.crop((left, top, right, bottom)) + + # convert to RGB mode + raw_image = raw_image.convert("RGB") + return raw_image def stitch_images(img1, img2): # Resize img2 to match img1's height width1, height1 = img1.size @@ -67,7 +96,7 @@ class model_factory: # self.name= "flux-schnell" source = model_def.get("source", None) self.model = load_flow_model(self.name, model_filename[0] if source is None else source, torch_device) - + self.model_def = model_def self.vae = load_ae(self.name, device=torch_device) siglip_processor = siglip_model = 
feature_embedder = None @@ -109,10 +138,12 @@ class model_factory: def generate( self, seed: int | None = None, - input_prompt: str = "replace the logo with the text 'Black Forest Labs'", + input_prompt: str = "replace the logo with the text 'Black Forest Labs'", n_prompt: str = None, sampling_steps: int = 20, input_ref_images = None, + image_guide= None, + image_mask= None, width= 832, height=480, embedded_guidance_scale: float = 2.5, @@ -123,7 +154,8 @@ class model_factory: batch_size = 1, video_prompt_type = "", joint_pass = False, - image_refs_relative_size = 100, + image_refs_relative_size = 100, + denoising_strength = 1., **bbargs ): if self._interrupt: @@ -132,8 +164,16 @@ class model_factory: if n_prompt is None or len(n_prompt) == 0: n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors" device="cuda" flux_dev_uso = self.name in ['flux-dev-uso'] - image_stiching = not self.name in ['flux-dev-uso'] #and False + flux_dev_umo = self.name in ['flux-dev-umo'] + latent_stiching = self.name in ['flux-dev-uso', 'flux-dev-umo'] + + lock_dimensions= False + input_ref_images = [] if input_ref_images is None else input_ref_images[:] + if flux_dev_umo: + ref_long_side = 512 if len(input_ref_images) <= 1 else 320 + input_ref_images = [preprocess_ref(img, ref_long_side) for img in input_ref_images] + lock_dimensions = True ref_style_imgs = [] if "I" in video_prompt_type and len(input_ref_images) > 0: if flux_dev_uso : @@ -143,22 +183,26 @@ class model_factory: elif len(input_ref_images) > 1 : ref_style_imgs = input_ref_images[-1:] input_ref_images = input_ref_images[:-1] - if image_stiching: + + if latent_stiching: + # latents stiching with resize + if not lock_dimensions : + for i in range(len(input_ref_images)): + w, h = input_ref_images[i].size + image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, 0) + input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + else: # image stiching method stiched = input_ref_images[0] for new_img in input_ref_images[1:]: stiched = stitch_images(stiched, new_img) input_ref_images = [stiched] - else: - # latents stiching with resize - for i in range(len(input_ref_images)): - w, h = input_ref_images[i].size - image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas) - input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + elif image_guide is not None: + input_ref_images = [image_guide] else: input_ref_images = None - if flux_dev_uso : + if self.name in ['flux-dev-uso', 'flux-dev-umo'] : inp, height, width = prepare_multi_ip( ae=self.vae, img_cond_list=input_ref_images, @@ -177,6 +221,7 @@ class model_factory: bs=batch_size, seed=seed, device=device, + img_mask=image_mask, ) inp.update(prepare_prompt(self.t5, self.clip, batch_size, input_prompt)) @@ -198,13 +243,19 @@ class model_factory: return unpack(x.float(), height, width) # denoise initial noise - x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass) + x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, 
real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass, denoising_strength = denoising_strength) if x==None: return None # decode latents to pixel space x = unpack_latent(x) with torch.autocast(device_type=device, dtype=torch.bfloat16): x = self.vae.decode(x) + if image_mask is not None: + from shared.utils.utils import convert_image_to_tensor + img_msk_rebuilt = inp["img_msk_rebuilt"] + img= convert_image_to_tensor(image_guide) + x = img.squeeze(2) * (1 - img_msk_rebuilt) + x.to(img) * img_msk_rebuilt + x = x.clamp(-1, 1) x = x.transpose(0, 1) return x diff --git a/models/flux/model.py b/models/flux/model.py index c4642d0..c5f7a24 100644 --- a/models/flux/model.py +++ b/models/flux/model.py @@ -190,6 +190,21 @@ class Flux(nn.Module): v = swap_scale_shift(v) k = k.replace("norm_out.linear", "final_layer.adaLN_modulation.1") new_sd[k] = v + # elif not first_key.startswith("diffusion_model.") and not first_key.startswith("transformer."): + # for k,v in sd.items(): + # if "double" in k: + # k = k.replace(".processor.proj_lora1.", ".img_attn.proj.lora_") + # k = k.replace(".processor.proj_lora2.", ".txt_attn.proj.lora_") + # k = k.replace(".processor.qkv_lora1.", ".img_attn.qkv.lora_") + # k = k.replace(".processor.qkv_lora2.", ".txt_attn.qkv.lora_") + # else: + # k = k.replace(".processor.qkv_lora.", ".linear1_qkv.lora_") + # k = k.replace(".processor.proj_lora.", ".linear2.lora_") + + # k = "diffusion_model." + k + # new_sd[k] = v + # from mmgp import safetensors2 + # safetensors2.torch_write_file(new_sd, "fff.safetensors") else: new_sd = sd return new_sd diff --git a/models/flux/sampling.py b/models/flux/sampling.py index f43ae15..1b4813a 100644 --- a/models/flux/sampling.py +++ b/models/flux/sampling.py @@ -138,10 +138,12 @@ def prepare_kontext( target_width: int | None = None, target_height: int | None = None, bs: int = 1, - + img_mask = None, ) -> tuple[dict[str, Tensor], int, int]: # load and encode the conditioning image + res_match_output = img_mask is not None + img_cond_seq = None img_cond_seq_ids = None if img_cond_list == None: img_cond_list = [] @@ -150,9 +152,11 @@ def prepare_kontext( for cond_no, img_cond in enumerate(img_cond_list): width, height = img_cond.size aspect_ratio = width / height - - # Kontext is trained on specific resolutions, using one of them is recommended - _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS) + if res_match_output: + width, height = target_width, target_height + else: + # Kontext is trained on specific resolutions, using one of them is recommended + _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS) width = 2 * int(width / 16) height = 2 * int(height / 16) @@ -193,6 +197,19 @@ def prepare_kontext( "img_cond_seq": img_cond_seq, "img_cond_seq_ids": img_cond_seq_ids, } + if img_mask is not None: + from shared.utils.utils import convert_image_to_tensor, convert_tensor_to_image + # image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) + image_mask_latents = convert_image_to_tensor(img_mask.resize((target_width // 16, target_height // 16), resample=Image.Resampling.LANCZOS)) + image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. 
)[0:1] + image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0) + convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") + image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device) + return_dict.update({ + "img_msk_latents": image_mask_latents, + "img_msk_rebuilt": image_mask_rebuilt, + }) + img = get_noise( bs, target_height, @@ -264,6 +281,9 @@ def denoise( loras_slists=None, unpack_latent = None, joint_pass= False, + img_msk_latents = None, + img_msk_rebuilt = None, + denoising_strength = 1, ): kwargs = {'pipeline': pipeline, 'callback': callback, "img_len" : img.shape[1], "siglip_embedding": siglip_embedding, "siglip_embedding_ids": siglip_embedding_ids} @@ -271,6 +291,21 @@ def denoise( if callback != None: callback(-1, None, True) + original_image_latents = None if img_cond_seq is None else img_cond_seq.clone() + + morph, first_step = False, 0 + if img_msk_latents is not None: + randn = torch.randn_like(original_image_latents) + if denoising_strength < 1.: + first_step = int(len(timesteps) * (1. - denoising_strength)) + if not morph: + latent_noise_factor = timesteps[first_step] + latents = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor + img = latents.to(img) + latents = None + timesteps = timesteps[first_step:] + + updated_num_steps= len(timesteps) -1 if callback != None: from shared.utils.loras_mutipliers import update_loras_slists @@ -280,10 +315,14 @@ def denoise( # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) for i, (t_curr, t_prev) in enumerate(zip(timesteps[:-1], timesteps[1:])): - offload.set_step_no_for_lora(model, i) + offload.set_step_no_for_lora(model, first_step + i) if pipeline._interrupt: return None + if img_msk_latents is not None and denoising_strength <1. 
and i == first_step and morph: + latent_noise_factor = t_curr/1000 + img = original_image_latents * (1.0 - latent_noise_factor) + img * latent_noise_factor + t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) img_input = img img_input_ids = img_ids @@ -333,6 +372,14 @@ def denoise( pred = neg_pred + real_guidance_scale * (pred - neg_pred) img += (t_prev - t_curr) * pred + + if img_msk_latents is not None: + latent_noise_factor = t_prev + # noisy_image = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + noisy_image = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor + img = noisy_image * (1-img_msk_latents) + img_msk_latents * img + noisy_image = None + if callback is not None: preview = unpack_latent(img).transpose(0,1) callback(i, preview, False) diff --git a/models/flux/util.py b/models/flux/util.py index 0f96103..af75f62 100644 --- a/models/flux/util.py +++ b/models/flux/util.py @@ -640,6 +640,38 @@ configs = { shift_factor=0.1159, ), ), + "flux-dev-umo": ModelSpec( + repo_id="", + repo_flow="", + repo_ae="ckpts/flux_vae.safetensors", + params=FluxParams( + in_channels=64, + out_channels=64, + vec_in_dim=768, + context_in_dim=4096, + hidden_size=3072, + mlp_ratio=4.0, + num_heads=24, + depth=19, + depth_single_blocks=38, + axes_dim=[16, 56, 56], + theta=10_000, + qkv_bias=True, + guidance_embed=True, + eso= True, + ), + ae_params=AutoEncoderParams( + resolution=256, + in_channels=3, + ch=128, + out_ch=3, + ch_mult=[1, 2, 4, 4], + num_res_blocks=2, + z_channels=16, + scale_factor=0.3611, + shift_factor=0.1159, + ), + ), } diff --git a/models/qwen/pipeline_qwenimage.py b/models/qwen/pipeline_qwenimage.py index 20838f5..0897ee4 100644 --- a/models/qwen/pipeline_qwenimage.py +++ b/models/qwen/pipeline_qwenimage.py @@ -714,14 +714,14 @@ class QwenImagePipeline(): #DiffusionPipeline image_mask_latents = convert_image_to_tensor(image_mask.resize((width // 16, height // 16), resample=Image.Resampling.LANCZOS)) image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1] image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0) - convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") + # convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device) prompt_image = image if image.size != (image_width, image_height): image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS) - image.save("nnn.png") + # image.save("nnn.png") image = convert_image_to_tensor(image).unsqueeze(0).unsqueeze(2) has_neg_prompt = negative_prompt is not None or ( @@ -811,12 +811,15 @@ class QwenImagePipeline(): #DiffusionPipeline negative_txt_seq_lens = ( negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None ) - morph = False - if image_mask_latents is not None and denoising_strength <= 1.: - first_step = int(len(timesteps) * (1. - denoising_strength)) + morph, first_step = False, 0 + if image_mask_latents is not None: + randn = torch.randn_like(original_image_latents) + if denoising_strength < 1.: + first_step = int(len(timesteps) * (1. 
- denoising_strength)) if not morph: latent_noise_factor = timesteps[first_step]/1000 - latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + # latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + latents = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor timesteps = timesteps[first_step:] self.scheduler.timesteps = timesteps self.scheduler.sigmas= self.scheduler.sigmas[first_step:] @@ -831,6 +834,7 @@ class QwenImagePipeline(): #DiffusionPipeline for i, t in enumerate(timesteps): + offload.set_step_no_for_lora(self.transformer, first_step + i) if self.interrupt: continue @@ -905,7 +909,8 @@ class QwenImagePipeline(): #DiffusionPipeline if image_mask_latents is not None: next_t = timesteps[i+1] if i sliding_window_size: + if model_type in ["t2v"] and not "G" in video_prompt_type : + gr.Info(f"You have requested to Generate Sliding Windows with a Text to Video model. Unless you use the Video to Video feature this is useless as a t2v model doesn't see past frames and it will generate the same video in each new window.") + return full_video_length = video_length if video_source is None else video_length + sliding_window_overlap -1 extra = "" if full_video_length == video_length else f" including {sliding_window_overlap} added for Video Continuation" no_windows = compute_sliding_window_no(full_video_length, sliding_window_size, sliding_window_discard_last_frames, sliding_window_overlap) gr.Info(f"The Number of Frames to generate ({video_length}{extra}) is greater than the Sliding Window Size ({sliding_window_size}), {no_windows} Windows will be generated") - if "recam" in model_filename: if video_guide == None: gr.Info("You must provide a Control Video") @@ -7019,28 +7020,38 @@ def categorize_resolution(resolution_str): return group return "1440p" -def group_resolutions(resolutions, selected_resolution): +def group_resolutions(model_def, resolutions, selected_resolution): + + model_resolutions = model_def.get("resolutions", None) + if model_resolutions is not None: + selected_group ="Locked" + available_groups = [selected_group ] + selected_group_resolutions = model_resolutions + else: + grouped_resolutions = {} + for resolution in resolutions: + group = categorize_resolution(resolution[1]) + if group not in grouped_resolutions: + grouped_resolutions[group] = [] + grouped_resolutions[group].append(resolution) + + available_groups = [group for group in group_thresholds if group in grouped_resolutions] - grouped_resolutions = {} - for resolution in resolutions: - group = categorize_resolution(resolution[1]) - if group not in grouped_resolutions: - grouped_resolutions[group] = [] - grouped_resolutions[group].append(resolution) - - available_groups = [group for group in group_thresholds if group in grouped_resolutions] - - selected_group = categorize_resolution(selected_resolution) - selected_group_resolutions = grouped_resolutions.get(selected_group, []) - available_groups.reverse() + selected_group = categorize_resolution(selected_resolution) + selected_group_resolutions = grouped_resolutions.get(selected_group, []) + available_groups.reverse() return available_groups, selected_group_resolutions, selected_group def change_resolution_group(state, selected_group): model_type = state["model_type"] model_def = get_model_def(model_type) model_resolutions = model_def.get("resolutions", None) - resolution_choices, _ = 
get_resolution_choices(None, model_resolutions) - group_resolution_choices = [ resolution for resolution in resolution_choices if categorize_resolution(resolution[1]) == selected_group ] + resolution_choices, _ = get_resolution_choices(None, model_resolutions) + if model_resolutions is None: + group_resolution_choices = [ resolution for resolution in resolution_choices if categorize_resolution(resolution[1]) == selected_group ] + else: + last_resolution = group_resolution_choices[0][1] + return gr.update(choices= group_resolution_choices, value= last_resolution) last_resolution_per_group = state["last_resolution_per_group"] last_resolution = last_resolution_per_group.get(selected_group, "") @@ -7051,6 +7062,11 @@ def change_resolution_group(state, selected_group): def record_last_resolution(state, resolution): + + model_type = state["model_type"] + model_def = get_model_def(model_type) + model_resolutions = model_def.get("resolutions", None) + if model_resolutions is not None: return server_config["last_resolution_choice"] = resolution selected_group = categorize_resolution(resolution) last_resolution_per_group = state["last_resolution_per_group"] @@ -7482,11 +7498,13 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non frames_positions = gr.Text(value=ui_defaults.get("frames_positions","") , visible= "F" in video_prompt_type_value, scale = 2, label= "Positions of Injected Frames separated by Spaces (1=first, no position for Objects / People)" ) image_refs_relative_size = gr.Slider(20, 100, value=ui_defaults.get("image_refs_relative_size", 50), step=1, label="Rescale Internaly Image Ref (% in relation to Output Video) to change Output Composition", visible = model_def.get("any_image_refs_relative_size", False) and image_outputs) - no_background_removal = model_def.get("no_background_removal", False) + no_background_removal = model_def.get("no_background_removal", False) or image_ref_choices is None + background_removal_label = model_def.get("background_removal_label", "Remove Backgrounds behind People / Objects") + remove_background_images_ref = gr.Dropdown( choices=[ ("Keep Backgrounds behind all Reference Images", 0), - ("Remove Backgrounds only behind People / Objects except main Subject / Landscape" if (flux or qwen) else ("Remove Backgrounds behind People / Objects, keep it for Landscape or positioned Frames" if vace else "Remove Backgrounds behind People / Objects") , 1), + (background_removal_label, 1), ], value=0 if no_background_removal else ui_defaults.get("remove_background_images_ref",1), label="Automatic Removal of Background of People or Objects (Only)", scale = 3, visible= "I" in video_prompt_type_value and not no_background_removal @@ -7578,7 +7596,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non current_resolution_choice = ui_defaults.get("resolution","832x480") if update_form or last_resolution is None else last_resolution model_resolutions = model_def.get("resolutions", None) resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions) - available_groups, selected_group_resolutions, selected_group = group_resolutions(resolution_choices, current_resolution_choice) + available_groups, selected_group_resolutions, selected_group = group_resolutions(model_def,resolution_choices, current_resolution_choice) resolution_group = gr.Dropdown( choices = available_groups, value= selected_group,
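
To make the resolution-grouping change above easier to follow, here is a minimal sketch of what the reworked group_resolutions() does: resolutions are bucketed by pixel count into named groups, unless the model definition carries its own "resolutions" list, in which case a single "Locked" group is returned and the generic bucketing is bypassed. The pixel thresholds and the example values below are illustrative assumptions; only the function names, the "Locked" group and the "1440p" fallback come from the patch itself.

# Illustrative sketch only: approximates categorize_resolution()/group_resolutions()
# from the wgp.py hunk above. Thresholds and example data are assumed, not WanGP's.
GROUP_THRESHOLDS = {"480p": 640 * 480, "720p": 1280 * 720, "1080p": 1920 * 1080}

def categorize_resolution(resolution: str) -> str:
    width, height = map(int, resolution.split("x"))
    pixels = width * height
    for group, max_pixels in GROUP_THRESHOLDS.items():
        if pixels <= max_pixels:
            return group
    return "1440p"  # fallback bucket, mirroring the patch

def group_resolutions(model_def, resolutions, selected_resolution):
    # A model definition may pin its own "resolutions" list; it is then exposed
    # as a single "Locked" group and the generic bucketing is skipped entirely.
    model_resolutions = model_def.get("resolutions")
    if model_resolutions is not None:
        return ["Locked"], model_resolutions, "Locked"

    grouped = {}
    for label, value in resolutions:          # resolutions are (label, "WxH") pairs
        grouped.setdefault(categorize_resolution(value), []).append((label, value))

    available = [g for g in list(GROUP_THRESHOLDS) + ["1440p"] if g in grouped]
    available.reverse()                       # largest buckets listed first
    selected_group = categorize_resolution(selected_resolution)
    return available, grouped.get(selected_group, []), selected_group

# Example: a model without a pinned resolution list falls back to bucketing
groups, choices, current = group_resolutions(
    {}, [("832x480 (16:9)", "832x480"), ("1280x720 (16:9)", "1280x720")], "832x480")

The "Locked" short-circuit is what lets finetunes such as Flux UMO (which declares its own "resolutions" list in defaults/flux_dev_umo.json) constrain the resolution dropdown without touching the generic grouping tables.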