From 9fa267087b2dfdba651fd173325537f031edf91d Mon Sep 17 00:00:00 2001
From: DeepBeepMeep
Date: Thu, 11 Sep 2025 21:23:05 +0200
Subject: [PATCH] Flux Festival

---
 README.md                         |  9 +++-
 defaults/flux_dev_umo.json        | 24 ++++++++++
 defaults/flux_dev_uso.json        |  2 +-
 defaults/flux_srpo.json           | 15 ++++++
 defaults/flux_srpo_uso.json       | 17 +++++++
 models/flux/flux_handler.py       | 22 +++++++++
 models/flux/flux_main.py          | 77 +++++++++++++++++++++++++------
 models/flux/model.py              | 15 ++++++
 models/flux/sampling.py           | 57 +++++++++++++++++++++--
 models/flux/util.py               | 32 +++++++++++++
 models/qwen/pipeline_qwenimage.py | 19 +++++---
 models/qwen/qwen_handler.py       |  1 +
 models/wan/any2video.py           |  2 +-
 models/wan/wan_handler.py         |  1 +
 wgp.py                            | 62 ++++++++++++++++---------
 15 files changed, 305 insertions(+), 50 deletions(-)
 create mode 100644 defaults/flux_dev_umo.json
 create mode 100644 defaults/flux_srpo.json
 create mode 100644 defaults/flux_srpo_uso.json

diff --git a/README.md b/README.md
index fc3d76c..d33b6dc 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
 
 ## 🔥 Latest Updates :
-### September 5 2025: WanGP v8.5 - Wanna be a Cropper or a Painter ?
+### September 11 2025: WanGP v8.5/8.55 - Wanna be a Cropper or a Painter ?
 I have done some intensive internal refactoring of the generation pipeline to ease support of existing models or add new models. Nothing really visible but this makes WanGP is little more future proof.
 
@@ -38,6 +38,13 @@ Doing more sophisticated thing Vace Image Editor works very well too: try Image
 For the best quality I recommend to set in *Quality Tab* the option: "*Generate a 9 Frames Long video...*"
 
+**update 8.55**: Flux Festival
+- **Inpainting Mode** also added for *Flux Kontext*
+- **Flux SRPO**: a new finetune with 3x better quality than Flux Dev, according to its authors. I have also created a *Flux SRPO USO* finetune, which is arguably the best open source *Style Transfer* tool available
+- **Flux UMO**: a model specialized in combining multiple reference objects / people together. Works quite well at 768x768
+
+Good luck finding your way through all the Flux model names!
+
 ### September 5 2025: WanGP v8.4 - Take me to Outer Space
 
 You have probably seen these short AI generated movies created using *Nano Banana* and the *First Frame - Last Frame* feature of *Kling 2.0*. The idea is to generate an image, modify a part of it with Nano Banana and give the these two images to Kling that will generate the Video between these two images, use now the previous Last Frame as the new First Frame, rinse and repeat and you get a full movie.
diff --git a/defaults/flux_dev_umo.json b/defaults/flux_dev_umo.json
new file mode 100644
index 0000000..57164bb
--- /dev/null
+++ b/defaults/flux_dev_umo.json
@@ -0,0 +1,24 @@
+{
+    "model": {
+        "name": "Flux 1 Dev UMO 12B",
+        "architecture": "flux",
+        "description": "FLUX.1 Dev UMO is a model that can Edit Images, with a specialization in combining multiple image references (resized internally to 512x512 max) into a single output image. Image preservation is best at a 768x768 output resolution.",
+        "URLs": "flux",
+        "flux-model": "flux-dev-umo",
+        "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-UMO_dit_lora_bf16.safetensors"],
+        "resolutions": [ ["1024x1024 (1:1)", "1024x1024"],
+                         ["768x1024 (3:4)", "768x1024"],
+                         ["1024x768 (4:3)", "1024x768"],
+                         ["512x1024 (1:2)", "512x1024"],
+                         ["1024x512 (2:1)", "1024x512"],
+                         ["768x768 (1:1)", "768x768"],
+                         ["768x512 (3:2)", "768x512"],
+                         ["512x768 (2:3)", "512x768"]]
+    },
+    "prompt": "the man is wearing a hat",
+    "embedded_guidance_scale": 4,
+    "resolution": "768x768",
+    "batch_size": 1
+}
+
\ No newline at end of file
diff --git a/defaults/flux_dev_uso.json b/defaults/flux_dev_uso.json
index 0cd7b82..806dd7e 100644
--- a/defaults/flux_dev_uso.json
+++ b/defaults/flux_dev_uso.json
@@ -2,7 +2,7 @@
     "model": {
         "name": "Flux 1 Dev USO 12B",
         "architecture": "flux",
-        "description": "FLUX.1 Dev USO is a model specialized to Edit Images with a specialization in Style Transfers (up to two).",
+        "description": "FLUX.1 Dev USO is a model that can Edit Images with a specialization in Style Transfers (up to two).",
         "modules": [ ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_projector_bf16.safetensors"]],
         "URLs": "flux",
         "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_dit_lora_bf16.safetensors"],
diff --git a/defaults/flux_srpo.json b/defaults/flux_srpo.json
new file mode 100644
index 0000000..59f07c6
--- /dev/null
+++ b/defaults/flux_srpo.json
@@ -0,0 +1,15 @@
+{
+    "model": {
+        "name": "Flux 1 SRPO Dev 12B",
+        "architecture": "flux",
+        "description": "By fine-tuning the FLUX.1 Dev model with optimized denoising and online reward adjustment, SRPO improves its human-evaluated realism and aesthetic quality by over 3x.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_quanto_bf16_int8.safetensors"
+        ],
+        "flux-model": "flux-dev"
+    },
+    "prompt": "draw a hat",
+    "resolution": "1024x1024",
+    "batch_size": 1
+}
\ No newline at end of file
diff --git a/defaults/flux_srpo_uso.json b/defaults/flux_srpo_uso.json
new file mode 100644
index 0000000..ddfe50d
--- /dev/null
+++ b/defaults/flux_srpo_uso.json
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Flux 1 SRPO USO 12B",
+        "architecture": "flux",
+        "description": "FLUX.1 SRPO USO is a model that can Edit Images with a specialization in Style Transfers (up to two).
It leverages the improved Image quality brought by the SRPO process", + "modules": [ "flux_dev_uso"], + "URLs": "flux_srpo", + "loras": "flux_dev_uso", + "flux-model": "flux-dev-uso" + }, + "prompt": "the man is wearing a hat", + "embedded_guidance_scale": 4, + "resolution": "1024x1024", + "batch_size": 1 +} + + \ No newline at end of file diff --git a/models/flux/flux_handler.py b/models/flux/flux_handler.py index c468d5a..808369f 100644 --- a/models/flux/flux_handler.py +++ b/models/flux/flux_handler.py @@ -13,6 +13,7 @@ class family_handler(): flux_schnell = flux_model == "flux-schnell" flux_chroma = flux_model == "flux-chroma" flux_uso = flux_model == "flux-dev-uso" + flux_umo = flux_model == "flux-dev-umo" flux_kontext = flux_model == "flux-dev-kontext" extra_model_def = { @@ -35,6 +36,7 @@ class family_handler(): } if flux_kontext: + extra_model_def["inpaint_support"] = True extra_model_def["image_ref_choices"] = { "choices": [ ("None", ""), @@ -43,6 +45,15 @@ class family_handler(): ], "letters_filter": "KI", } + extra_model_def["background_removal_label"]= "Remove Backgrounds only behind People / Objects except main Subject / Landscape" + elif flux_umo: + extra_model_def["image_ref_choices"] = { + "choices": [ + ("Conditional Images are People / Objects", "I"), + ], + "letters_filter": "I", + "visible": False + } extra_model_def["lock_image_refs_ratios"] = True @@ -131,10 +142,14 @@ class family_handler(): video_prompt_type = video_prompt_type.replace("I", "KI") ui_defaults["video_prompt_type"] = video_prompt_type + if settings_version < 2.34: + ui_defaults["denoising_strength"] = 1. + @staticmethod def update_default_settings(base_model_type, model_def, ui_defaults): flux_model = model_def.get("flux-model", "flux-dev") flux_uso = flux_model == "flux-dev-uso" + flux_umo = flux_model == "flux-dev-umo" flux_kontext = flux_model == "flux-dev-kontext" ui_defaults.update({ "embedded_guidance": 2.5, @@ -143,5 +158,12 @@ class family_handler(): if flux_kontext or flux_uso: ui_defaults.update({ "video_prompt_type": "KI", + "denoising_strength": 1., }) + elif flux_umo: + ui_defaults.update({ + "video_prompt_type": "I", + "remove_background_images_ref": 0, + }) + diff --git a/models/flux/flux_main.py b/models/flux/flux_main.py index 4d7c67d..6863711 100644 --- a/models/flux/flux_main.py +++ b/models/flux/flux_main.py @@ -23,6 +23,35 @@ from .util import ( ) from PIL import Image +def preprocess_ref(raw_image: Image.Image, long_size: int = 512): + # 获取原始图像的宽度和高度 + image_w, image_h = raw_image.size + + # 计算长边和短边 + if image_w >= image_h: + new_w = long_size + new_h = int((long_size / image_w) * image_h) + else: + new_h = long_size + new_w = int((long_size / image_h) * image_w) + + # 按新的宽高进行等比例缩放 + raw_image = raw_image.resize((new_w, new_h), resample=Image.LANCZOS) + target_w = new_w // 16 * 16 + target_h = new_h // 16 * 16 + + # 计算裁剪的起始坐标以实现中心裁剪 + left = (new_w - target_w) // 2 + top = (new_h - target_h) // 2 + right = left + target_w + bottom = top + target_h + + # 进行中心裁剪 + raw_image = raw_image.crop((left, top, right, bottom)) + + # 转换为 RGB 模式 + raw_image = raw_image.convert("RGB") + return raw_image def stitch_images(img1, img2): # Resize img2 to match img1's height @@ -67,7 +96,7 @@ class model_factory: # self.name= "flux-schnell" source = model_def.get("source", None) self.model = load_flow_model(self.name, model_filename[0] if source is None else source, torch_device) - + self.model_def = model_def self.vae = load_ae(self.name, device=torch_device) siglip_processor = siglip_model = 
feature_embedder = None @@ -109,10 +138,12 @@ class model_factory: def generate( self, seed: int | None = None, - input_prompt: str = "replace the logo with the text 'Black Forest Labs'", + input_prompt: str = "replace the logo with the text 'Black Forest Labs'", n_prompt: str = None, sampling_steps: int = 20, input_ref_images = None, + image_guide= None, + image_mask= None, width= 832, height=480, embedded_guidance_scale: float = 2.5, @@ -123,7 +154,8 @@ class model_factory: batch_size = 1, video_prompt_type = "", joint_pass = False, - image_refs_relative_size = 100, + image_refs_relative_size = 100, + denoising_strength = 1., **bbargs ): if self._interrupt: @@ -132,8 +164,16 @@ class model_factory: if n_prompt is None or len(n_prompt) == 0: n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors" device="cuda" flux_dev_uso = self.name in ['flux-dev-uso'] - image_stiching = not self.name in ['flux-dev-uso'] #and False + flux_dev_umo = self.name in ['flux-dev-umo'] + latent_stiching = self.name in ['flux-dev-uso', 'flux-dev-umo'] + + lock_dimensions= False + input_ref_images = [] if input_ref_images is None else input_ref_images[:] + if flux_dev_umo: + ref_long_side = 512 if len(input_ref_images) <= 1 else 320 + input_ref_images = [preprocess_ref(img, ref_long_side) for img in input_ref_images] + lock_dimensions = True ref_style_imgs = [] if "I" in video_prompt_type and len(input_ref_images) > 0: if flux_dev_uso : @@ -143,22 +183,26 @@ class model_factory: elif len(input_ref_images) > 1 : ref_style_imgs = input_ref_images[-1:] input_ref_images = input_ref_images[:-1] - if image_stiching: + + if latent_stiching: + # latents stiching with resize + if not lock_dimensions : + for i in range(len(input_ref_images)): + w, h = input_ref_images[i].size + image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, 0) + input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + else: # image stiching method stiched = input_ref_images[0] for new_img in input_ref_images[1:]: stiched = stitch_images(stiched, new_img) input_ref_images = [stiched] - else: - # latents stiching with resize - for i in range(len(input_ref_images)): - w, h = input_ref_images[i].size - image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas) - input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + elif image_guide is not None: + input_ref_images = [image_guide] else: input_ref_images = None - if flux_dev_uso : + if self.name in ['flux-dev-uso', 'flux-dev-umo'] : inp, height, width = prepare_multi_ip( ae=self.vae, img_cond_list=input_ref_images, @@ -177,6 +221,7 @@ class model_factory: bs=batch_size, seed=seed, device=device, + img_mask=image_mask, ) inp.update(prepare_prompt(self.t5, self.clip, batch_size, input_prompt)) @@ -198,13 +243,19 @@ class model_factory: return unpack(x.float(), height, width) # denoise initial noise - x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass) + x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, 
real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass, denoising_strength = denoising_strength) if x==None: return None # decode latents to pixel space x = unpack_latent(x) with torch.autocast(device_type=device, dtype=torch.bfloat16): x = self.vae.decode(x) + if image_mask is not None: + from shared.utils.utils import convert_image_to_tensor + img_msk_rebuilt = inp["img_msk_rebuilt"] + img= convert_image_to_tensor(image_guide) + x = img.squeeze(2) * (1 - img_msk_rebuilt) + x.to(img) * img_msk_rebuilt + x = x.clamp(-1, 1) x = x.transpose(0, 1) return x diff --git a/models/flux/model.py b/models/flux/model.py index c4642d0..c5f7a24 100644 --- a/models/flux/model.py +++ b/models/flux/model.py @@ -190,6 +190,21 @@ class Flux(nn.Module): v = swap_scale_shift(v) k = k.replace("norm_out.linear", "final_layer.adaLN_modulation.1") new_sd[k] = v + # elif not first_key.startswith("diffusion_model.") and not first_key.startswith("transformer."): + # for k,v in sd.items(): + # if "double" in k: + # k = k.replace(".processor.proj_lora1.", ".img_attn.proj.lora_") + # k = k.replace(".processor.proj_lora2.", ".txt_attn.proj.lora_") + # k = k.replace(".processor.qkv_lora1.", ".img_attn.qkv.lora_") + # k = k.replace(".processor.qkv_lora2.", ".txt_attn.qkv.lora_") + # else: + # k = k.replace(".processor.qkv_lora.", ".linear1_qkv.lora_") + # k = k.replace(".processor.proj_lora.", ".linear2.lora_") + + # k = "diffusion_model." + k + # new_sd[k] = v + # from mmgp import safetensors2 + # safetensors2.torch_write_file(new_sd, "fff.safetensors") else: new_sd = sd return new_sd diff --git a/models/flux/sampling.py b/models/flux/sampling.py index f43ae15..1b4813a 100644 --- a/models/flux/sampling.py +++ b/models/flux/sampling.py @@ -138,10 +138,12 @@ def prepare_kontext( target_width: int | None = None, target_height: int | None = None, bs: int = 1, - + img_mask = None, ) -> tuple[dict[str, Tensor], int, int]: # load and encode the conditioning image + res_match_output = img_mask is not None + img_cond_seq = None img_cond_seq_ids = None if img_cond_list == None: img_cond_list = [] @@ -150,9 +152,11 @@ def prepare_kontext( for cond_no, img_cond in enumerate(img_cond_list): width, height = img_cond.size aspect_ratio = width / height - - # Kontext is trained on specific resolutions, using one of them is recommended - _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS) + if res_match_output: + width, height = target_width, target_height + else: + # Kontext is trained on specific resolutions, using one of them is recommended + _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS) width = 2 * int(width / 16) height = 2 * int(height / 16) @@ -193,6 +197,19 @@ def prepare_kontext( "img_cond_seq": img_cond_seq, "img_cond_seq_ids": img_cond_seq_ids, } + if img_mask is not None: + from shared.utils.utils import convert_image_to_tensor, convert_tensor_to_image + # image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) + image_mask_latents = convert_image_to_tensor(img_mask.resize((target_width // 16, target_height // 16), resample=Image.Resampling.LANCZOS)) + image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. 
)[0:1] + image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0) + convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") + image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device) + return_dict.update({ + "img_msk_latents": image_mask_latents, + "img_msk_rebuilt": image_mask_rebuilt, + }) + img = get_noise( bs, target_height, @@ -264,6 +281,9 @@ def denoise( loras_slists=None, unpack_latent = None, joint_pass= False, + img_msk_latents = None, + img_msk_rebuilt = None, + denoising_strength = 1, ): kwargs = {'pipeline': pipeline, 'callback': callback, "img_len" : img.shape[1], "siglip_embedding": siglip_embedding, "siglip_embedding_ids": siglip_embedding_ids} @@ -271,6 +291,21 @@ def denoise( if callback != None: callback(-1, None, True) + original_image_latents = None if img_cond_seq is None else img_cond_seq.clone() + + morph, first_step = False, 0 + if img_msk_latents is not None: + randn = torch.randn_like(original_image_latents) + if denoising_strength < 1.: + first_step = int(len(timesteps) * (1. - denoising_strength)) + if not morph: + latent_noise_factor = timesteps[first_step] + latents = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor + img = latents.to(img) + latents = None + timesteps = timesteps[first_step:] + + updated_num_steps= len(timesteps) -1 if callback != None: from shared.utils.loras_mutipliers import update_loras_slists @@ -280,10 +315,14 @@ def denoise( # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) for i, (t_curr, t_prev) in enumerate(zip(timesteps[:-1], timesteps[1:])): - offload.set_step_no_for_lora(model, i) + offload.set_step_no_for_lora(model, first_step + i) if pipeline._interrupt: return None + if img_msk_latents is not None and denoising_strength <1. 
and i == first_step and morph: + latent_noise_factor = t_curr/1000 + img = original_image_latents * (1.0 - latent_noise_factor) + img * latent_noise_factor + t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) img_input = img img_input_ids = img_ids @@ -333,6 +372,14 @@ def denoise( pred = neg_pred + real_guidance_scale * (pred - neg_pred) img += (t_prev - t_curr) * pred + + if img_msk_latents is not None: + latent_noise_factor = t_prev + # noisy_image = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + noisy_image = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor + img = noisy_image * (1-img_msk_latents) + img_msk_latents * img + noisy_image = None + if callback is not None: preview = unpack_latent(img).transpose(0,1) callback(i, preview, False) diff --git a/models/flux/util.py b/models/flux/util.py index 0f96103..af75f62 100644 --- a/models/flux/util.py +++ b/models/flux/util.py @@ -640,6 +640,38 @@ configs = { shift_factor=0.1159, ), ), + "flux-dev-umo": ModelSpec( + repo_id="", + repo_flow="", + repo_ae="ckpts/flux_vae.safetensors", + params=FluxParams( + in_channels=64, + out_channels=64, + vec_in_dim=768, + context_in_dim=4096, + hidden_size=3072, + mlp_ratio=4.0, + num_heads=24, + depth=19, + depth_single_blocks=38, + axes_dim=[16, 56, 56], + theta=10_000, + qkv_bias=True, + guidance_embed=True, + eso= True, + ), + ae_params=AutoEncoderParams( + resolution=256, + in_channels=3, + ch=128, + out_ch=3, + ch_mult=[1, 2, 4, 4], + num_res_blocks=2, + z_channels=16, + scale_factor=0.3611, + shift_factor=0.1159, + ), + ), } diff --git a/models/qwen/pipeline_qwenimage.py b/models/qwen/pipeline_qwenimage.py index 20838f5..0897ee4 100644 --- a/models/qwen/pipeline_qwenimage.py +++ b/models/qwen/pipeline_qwenimage.py @@ -714,14 +714,14 @@ class QwenImagePipeline(): #DiffusionPipeline image_mask_latents = convert_image_to_tensor(image_mask.resize((width // 16, height // 16), resample=Image.Resampling.LANCZOS)) image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1] image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0) - convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") + # convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device) prompt_image = image if image.size != (image_width, image_height): image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS) - image.save("nnn.png") + # image.save("nnn.png") image = convert_image_to_tensor(image).unsqueeze(0).unsqueeze(2) has_neg_prompt = negative_prompt is not None or ( @@ -811,12 +811,15 @@ class QwenImagePipeline(): #DiffusionPipeline negative_txt_seq_lens = ( negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None ) - morph = False - if image_mask_latents is not None and denoising_strength <= 1.: - first_step = int(len(timesteps) * (1. - denoising_strength)) + morph, first_step = False, 0 + if image_mask_latents is not None: + randn = torch.randn_like(original_image_latents) + if denoising_strength < 1.: + first_step = int(len(timesteps) * (1. 
- denoising_strength)) if not morph: latent_noise_factor = timesteps[first_step]/1000 - latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + # latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + latents = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor timesteps = timesteps[first_step:] self.scheduler.timesteps = timesteps self.scheduler.sigmas= self.scheduler.sigmas[first_step:] @@ -831,6 +834,7 @@ class QwenImagePipeline(): #DiffusionPipeline for i, t in enumerate(timesteps): + offload.set_step_no_for_lora(self.transformer, first_step + i) if self.interrupt: continue @@ -905,7 +909,8 @@ class QwenImagePipeline(): #DiffusionPipeline if image_mask_latents is not None: next_t = timesteps[i+1] if i sliding_window_size: + if model_type in ["t2v"] and not "G" in video_prompt_type : + gr.Info(f"You have requested to Generate Sliding Windows with a Text to Video model. Unless you use the Video to Video feature this is useless as a t2v model doesn't see past frames and it will generate the same video in each new window.") + return full_video_length = video_length if video_source is None else video_length + sliding_window_overlap -1 extra = "" if full_video_length == video_length else f" including {sliding_window_overlap} added for Video Continuation" no_windows = compute_sliding_window_no(full_video_length, sliding_window_size, sliding_window_discard_last_frames, sliding_window_overlap) gr.Info(f"The Number of Frames to generate ({video_length}{extra}) is greater than the Sliding Window Size ({sliding_window_size}), {no_windows} Windows will be generated") - if "recam" in model_filename: if video_guide == None: gr.Info("You must provide a Control Video") @@ -7019,28 +7020,38 @@ def categorize_resolution(resolution_str): return group return "1440p" -def group_resolutions(resolutions, selected_resolution): +def group_resolutions(model_def, resolutions, selected_resolution): + + model_resolutions = model_def.get("resolutions", None) + if model_resolutions is not None: + selected_group ="Locked" + available_groups = [selected_group ] + selected_group_resolutions = model_resolutions + else: + grouped_resolutions = {} + for resolution in resolutions: + group = categorize_resolution(resolution[1]) + if group not in grouped_resolutions: + grouped_resolutions[group] = [] + grouped_resolutions[group].append(resolution) + + available_groups = [group for group in group_thresholds if group in grouped_resolutions] - grouped_resolutions = {} - for resolution in resolutions: - group = categorize_resolution(resolution[1]) - if group not in grouped_resolutions: - grouped_resolutions[group] = [] - grouped_resolutions[group].append(resolution) - - available_groups = [group for group in group_thresholds if group in grouped_resolutions] - - selected_group = categorize_resolution(selected_resolution) - selected_group_resolutions = grouped_resolutions.get(selected_group, []) - available_groups.reverse() + selected_group = categorize_resolution(selected_resolution) + selected_group_resolutions = grouped_resolutions.get(selected_group, []) + available_groups.reverse() return available_groups, selected_group_resolutions, selected_group def change_resolution_group(state, selected_group): model_type = state["model_type"] model_def = get_model_def(model_type) model_resolutions = model_def.get("resolutions", None) - resolution_choices, _ = 
get_resolution_choices(None, model_resolutions) - group_resolution_choices = [ resolution for resolution in resolution_choices if categorize_resolution(resolution[1]) == selected_group ] + resolution_choices, _ = get_resolution_choices(None, model_resolutions) + if model_resolutions is None: + group_resolution_choices = [ resolution for resolution in resolution_choices if categorize_resolution(resolution[1]) == selected_group ] + else: + last_resolution = group_resolution_choices[0][1] + return gr.update(choices= group_resolution_choices, value= last_resolution) last_resolution_per_group = state["last_resolution_per_group"] last_resolution = last_resolution_per_group.get(selected_group, "") @@ -7051,6 +7062,11 @@ def change_resolution_group(state, selected_group): def record_last_resolution(state, resolution): + + model_type = state["model_type"] + model_def = get_model_def(model_type) + model_resolutions = model_def.get("resolutions", None) + if model_resolutions is not None: return server_config["last_resolution_choice"] = resolution selected_group = categorize_resolution(resolution) last_resolution_per_group = state["last_resolution_per_group"] @@ -7482,11 +7498,13 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non frames_positions = gr.Text(value=ui_defaults.get("frames_positions","") , visible= "F" in video_prompt_type_value, scale = 2, label= "Positions of Injected Frames separated by Spaces (1=first, no position for Objects / People)" ) image_refs_relative_size = gr.Slider(20, 100, value=ui_defaults.get("image_refs_relative_size", 50), step=1, label="Rescale Internaly Image Ref (% in relation to Output Video) to change Output Composition", visible = model_def.get("any_image_refs_relative_size", False) and image_outputs) - no_background_removal = model_def.get("no_background_removal", False) + no_background_removal = model_def.get("no_background_removal", False) or image_ref_choices is None + background_removal_label = model_def.get("background_removal_label", "Remove Backgrounds behind People / Objects") + remove_background_images_ref = gr.Dropdown( choices=[ ("Keep Backgrounds behind all Reference Images", 0), - ("Remove Backgrounds only behind People / Objects except main Subject / Landscape" if (flux or qwen) else ("Remove Backgrounds behind People / Objects, keep it for Landscape or positioned Frames" if vace else "Remove Backgrounds behind People / Objects") , 1), + (background_removal_label, 1), ], value=0 if no_background_removal else ui_defaults.get("remove_background_images_ref",1), label="Automatic Removal of Background of People or Objects (Only)", scale = 3, visible= "I" in video_prompt_type_value and not no_background_removal @@ -7578,7 +7596,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non current_resolution_choice = ui_defaults.get("resolution","832x480") if update_form or last_resolution is None else last_resolution model_resolutions = model_def.get("resolutions", None) resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions) - available_groups, selected_group_resolutions, selected_group = group_resolutions(resolution_choices, current_resolution_choice) + available_groups, selected_group_resolutions, selected_group = group_resolutions(model_def,resolution_choices, current_resolution_choice) resolution_group = gr.Dropdown( choices = available_groups, value= selected_group,
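The inpainting support added in this patch uses the same per-step trick in both models/flux/sampling.py and models/qwen/pipeline_qwenimage.py: after each Euler step of the flow-matching ODE, the original image latents are re-noised to the next timestep and pasted back everywhere outside the mask, while denoising_strength controls how many early steps are skipped. Below is a minimal, self-contained sketch of that composite step; the function name, tensor shapes, and the standalone demo are illustrative assumptions, not code from the patch.

import torch

def masked_denoise_step(img, pred, t_curr, t_prev, original_latents, randn, mask):
    """One flow-matching Euler step followed by the mask composite used for inpainting.

    img, pred, original_latents, randn: (batch, seq_len, channels) packed latents (hypothetical shapes)
    mask: (1, seq_len, 1), 1 where the image should be regenerated, 0 where it must be kept
    t_curr, t_prev: scalar timesteps in [0, 1], with t_prev < t_curr
    """
    # Euler update of the flow-matching ODE, same form as the denoise loop in the diff
    img = img + (t_prev - t_curr) * pred
    # Re-noise the original latents to the next timestep and keep them outside the mask,
    # so only the masked region is actually regenerated
    noisy_original = original_latents * (1.0 - t_prev) + randn * t_prev
    return noisy_original * (1 - mask) + mask * img

if __name__ == "__main__":
    b, n, c = 1, 64, 64                      # made-up packed-latent dimensions for the demo
    img, pred = torch.randn(b, n, c), torch.randn(b, n, c)
    original, randn = torch.randn(b, n, c), torch.randn(b, n, c)
    mask = (torch.rand(1, n, 1) > 0.5).float()
    out = masked_denoise_step(img, pred, 0.8, 0.7, original, randn, mask)
    print(out.shape)                         # torch.Size([1, 64, 64])

When denoising_strength is below 1, the loops in the patch additionally start at first_step = int(len(timesteps) * (1 - denoising_strength)) from a partially re-noised copy of the original latents, which is what makes the slider behave like an image-to-image strength control.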