mirror of
https://github.com/Wan-Video/Wan2.1.git
Flux Festival
This commit is contained in:
parent 119162373a
commit 9fa267087b
@@ -20,7 +20,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep

 ## 🔥 Latest Updates :

-### September 5 2025: WanGP v8.5 - Wanna be a Cropper or a Painter ?
+### September 11 2025: WanGP v8.5/8.55 - Wanna be a Cropper or a Painter ?

 I have done some intensive internal refactoring of the generation pipeline to ease support of existing models or add new models. Nothing really visible, but this makes WanGP a little more future proof.
@@ -38,6 +38,13 @@ Doing more sophisticated thing Vace Image Editor works very well too: try Image

 For the best quality I recommend to set in *Quality Tab* the option: "*Generate a 9 Frames Long video...*"

+**update 8.55**: Flux Festival
+- **Inpainting Mode** also added for *Flux Kontext*
+- **Flux SRPO**: new finetune with 3x better quality vs Flux Dev according to its authors. I have also created a *Flux SRPO USO* finetune, which is certainly the best open source *Style Transfer* tool available
+- **Flux UMO**: model specialized in combining multiple reference objects / people together. Works quite well at 768x768
+
+Good luck finding your way through all the Flux model names!
+
 ### September 5 2025: WanGP v8.4 - Take me to Outer Space

 You have probably seen these short AI generated movies created using *Nano Banana* and the *First Frame - Last Frame* feature of *Kling 2.0*. The idea is to generate an image, modify a part of it with Nano Banana, give these two images to Kling, which will generate the Video between them, then use the previous Last Frame as the new First Frame, rinse and repeat, and you get a full movie.
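
The chaining idea described in the v8.4 note boils down to a short loop. A purely hypothetical sketch: none of these callables exist in WanGP, they stand in for the external tools mentioned above (an image generator, a Nano-Banana-style editor, a Kling-style first/last-frame model).

```python
# Hypothetical sketch of the chaining workflow described above. Pass in whatever
# image generator, image editor and first/last-frame video model you actually use.
def make_movie(n_shots, first_image, edit_image, video_between, last_frame):
    clips, start = [], first_image
    for _ in range(n_shots):
        end = edit_image(start)                   # e.g. a Nano-Banana-style partial edit
        clips.append(video_between(start, end))   # e.g. a Kling-style First/Last Frame clip
        start = last_frame(clips[-1])             # previous Last Frame becomes the new First Frame
    return clips
```
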
defaults/flux_dev_umo.json (new file, 24 lines)
@@ -0,0 +1,24 @@
+{
+    "model": {
+        "name": "Flux 1 Dev UMO 12B",
+        "architecture": "flux",
+        "description": "FLUX.1 Dev UMO is a model that can Edit Images with a specialization in combining multiple image references (resized internally at 512x512 max) to produce an Image output. Best Image preservation at 768x768 Resolution Output.",
+        "URLs": "flux",
+        "flux-model": "flux-dev-umo",
+        "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-UMO_dit_lora_bf16.safetensors"],
+        "resolutions": [ ["1024x1024 (1:1)", "1024x1024"],
+                         ["768x1024 (3:4)", "768x1024"],
+                         ["1024x768 (4:3)", "1024x768"],
+                         ["512x1024 (1:2)", "512x1024"],
+                         ["1024x512 (2:1)", "1024x512"],
+                         ["768x768 (1:1)", "768x768"],
+                         ["768x512 (3:2)", "768x512"],
+                         ["512x768 (2:3)", "512x768"]]
+    },
+    "prompt": "the man is wearing a hat",
+    "embedded_guidance_scale": 4,
+    "resolution": "768x768",
+    "batch_size": 1
+}
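
As a quick orientation for the model definition above, here is a minimal sketch (an assumption, not WanGP's actual loader) of reading the file and splitting the "resolutions" label/value pairs into numeric sizes.

```python
# Minimal sketch (assumption, not WanGP's loader) of reading the definition above.
import json

with open("defaults/flux_dev_umo.json") as f:
    defaults = json.load(f)

model = defaults["model"]
# each "resolutions" entry pairs a display label with a "WIDTHxHEIGHT" value
sizes = [tuple(map(int, value.split("x"))) for _, value in model["resolutions"]]
print(model["name"], sizes[0])     # Flux 1 Dev UMO 12B (1024, 1024)
print(defaults["resolution"])      # default output resolution: 768x768
```
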
@@ -2,7 +2,7 @@
     "model": {
         "name": "Flux 1 Dev USO 12B",
         "architecture": "flux",
-        "description": "FLUX.1 Dev USO is a model specialized to Edit Images with a specialization in Style Transfers (up to two).",
+        "description": "FLUX.1 Dev USO is a model that can Edit Images with a specialization in Style Transfers (up to two).",
         "modules": [ ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_projector_bf16.safetensors"]],
         "URLs": "flux",
         "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_dit_lora_bf16.safetensors"],
defaults/flux_srpo.json (new file, 15 lines)
@@ -0,0 +1,15 @@
+{
+    "model": {
+        "name": "Flux 1 SRPO Dev 12B",
+        "architecture": "flux",
+        "description": "By fine-tuning the FLUX.1.dev model with optimized denoising and online reward adjustment, SRPO improves its human-evaluated realism and aesthetic quality by over 3x.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_quanto_bf16_int8.safetensors"
+        ],
+        "flux-model": "flux-dev"
+    },
+    "prompt": "draw a hat",
+    "resolution": "1024x1024",
+    "batch_size": 1
+}
defaults/flux_srpo_uso.json (new file, 17 lines)
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Flux 1 SRPO USO 12B",
+        "architecture": "flux",
+        "description": "FLUX.1 SRPO USO is a model that can Edit Images with a specialization in Style Transfers (up to two). It leverages the improved Image quality brought by the SRPO process",
+        "modules": [ "flux_dev_uso"],
+        "URLs": "flux_srpo",
+        "loras": "flux_dev_uso",
+        "flux-model": "flux-dev-uso"
+    },
+    "prompt": "the man is wearing a hat",
+    "embedded_guidance_scale": 4,
+    "resolution": "1024x1024",
+    "batch_size": 1
+}
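
Several fields in these definitions hold a plain string instead of a list ("URLs": "flux_srpo", "loras": "flux_dev_uso"), which appears to name another defaults file whose value should be reused. A hedged sketch of such a resolver, an assumption about the loader rather than WanGP's code:

```python
# Hedged sketch (assumption): a non-URL string value for "URLs" or "loras" names a
# sibling defaults file whose value of the same field is reused.
import json

def resolve(field, value, folder="defaults"):
    if isinstance(value, str) and not value.startswith("http"):
        with open(f"{folder}/{value}.json") as f:
            return json.load(f)["model"].get(field, value)
    return value

with open("defaults/flux_srpo_uso.json") as f:
    model = json.load(f)["model"]

urls = resolve("URLs", model["URLs"])      # follows "flux_srpo" to its explicit URL list
loras = resolve("loras", model["loras"])   # follows "flux_dev_uso" to its LoRA URL
```
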
@@ -13,6 +13,7 @@ class family_handler():
         flux_schnell = flux_model == "flux-schnell"
         flux_chroma = flux_model == "flux-chroma"
         flux_uso = flux_model == "flux-dev-uso"
+        flux_umo = flux_model == "flux-dev-umo"
         flux_kontext = flux_model == "flux-dev-kontext"

         extra_model_def = {
@@ -35,6 +36,7 @@ class family_handler():
         }

         if flux_kontext:
+            extra_model_def["inpaint_support"] = True
             extra_model_def["image_ref_choices"] = {
                 "choices": [
                     ("None", ""),
@@ -43,6 +45,15 @@ class family_handler():
                 ],
                 "letters_filter": "KI",
             }
+            extra_model_def["background_removal_label"] = "Remove Backgrounds only behind People / Objects except main Subject / Landscape"
+        elif flux_umo:
+            extra_model_def["image_ref_choices"] = {
+                "choices": [
+                    ("Conditional Images are People / Objects", "I"),
+                ],
+                "letters_filter": "I",
+                "visible": False
+            }

         extra_model_def["lock_image_refs_ratios"] = True
@@ -131,10 +142,14 @@ class family_handler():
             video_prompt_type = video_prompt_type.replace("I", "KI")
             ui_defaults["video_prompt_type"] = video_prompt_type

+        if settings_version < 2.34:
+            ui_defaults["denoising_strength"] = 1.
+
     @staticmethod
     def update_default_settings(base_model_type, model_def, ui_defaults):
         flux_model = model_def.get("flux-model", "flux-dev")
         flux_uso = flux_model == "flux-dev-uso"
+        flux_umo = flux_model == "flux-dev-umo"
         flux_kontext = flux_model == "flux-dev-kontext"
         ui_defaults.update({
             "embedded_guidance": 2.5,
@@ -143,5 +158,12 @@ class family_handler():
         if flux_kontext or flux_uso:
             ui_defaults.update({
                 "video_prompt_type": "KI",
+                "denoising_strength": 1.,
+            })
+        elif flux_umo:
+            ui_defaults.update({
+                "video_prompt_type": "I",
+                "remove_background_images_ref": 0,
             })
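
Together with the settings_version bump to 2.34 in wgp.py further down (older saved settings get denoising_strength reset to 1.), the branches above define what a fresh UI starts with for each Flux variant. A standalone illustration that mirrors the branches rather than importing WanGP, so take it as a sketch, not the actual API:

```python
# Standalone illustration of the per-variant defaults above (does not import WanGP).
def flux_ui_defaults(flux_model):
    ui = {"embedded_guidance": 2.5}
    if flux_model in ("flux-dev-kontext", "flux-dev-uso"):
        ui.update({"video_prompt_type": "KI", "denoising_strength": 1.})
    elif flux_model == "flux-dev-umo":
        ui.update({"video_prompt_type": "I", "remove_background_images_ref": 0})
    return ui

print(flux_ui_defaults("flux-dev-umo"))
# {'embedded_guidance': 2.5, 'video_prompt_type': 'I', 'remove_background_images_ref': 0}
```
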
@@ -23,6 +23,35 @@ from .util import (
 )

 from PIL import Image
+
+def preprocess_ref(raw_image: Image.Image, long_size: int = 512):
+    # get the width and height of the original image
+    image_w, image_h = raw_image.size
+
+    # compute the new width and height (long side scaled to long_size)
+    if image_w >= image_h:
+        new_w = long_size
+        new_h = int((long_size / image_w) * image_h)
+    else:
+        new_h = long_size
+        new_w = int((long_size / image_h) * image_w)
+
+    # resize proportionally to the new width and height
+    raw_image = raw_image.resize((new_w, new_h), resample=Image.LANCZOS)
+    target_w = new_w // 16 * 16
+    target_h = new_h // 16 * 16
+
+    # compute the crop origin for a center crop
+    left = (new_w - target_w) // 2
+    top = (new_h - target_h) // 2
+    right = left + target_w
+    bottom = top + target_h
+
+    # center crop
+    raw_image = raw_image.crop((left, top, right, bottom))
+
+    # convert to RGB mode
+    raw_image = raw_image.convert("RGB")
+    return raw_image
+
 def stitch_images(img1, img2):
     # Resize img2 to match img1's height
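
A worked example of the arithmetic in preprocess_ref above: a 1000x700 reference with long_size=512 is first resized to 512x358, then center-cropped to the nearest multiples of 16 (standalone check, no WanGP imports).

```python
# Standalone check of the resize/center-crop arithmetic in preprocess_ref above.
image_w, image_h, long_size = 1000, 700, 512
new_w, new_h = long_size, int((long_size / image_w) * image_h)   # (512, 358): long side pinned to 512
target_w, target_h = new_w // 16 * 16, new_h // 16 * 16          # (512, 352): snap down to multiples of 16
left, top = (new_w - target_w) // 2, (new_h - target_h) // 2     # (0, 3): center the crop
print((target_w, target_h))                                      # (512, 352)
```
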
@@ -67,7 +96,7 @@ class model_factory:
         # self.name= "flux-schnell"
         source = model_def.get("source", None)
         self.model = load_flow_model(self.name, model_filename[0] if source is None else source, torch_device)
+        self.model_def = model_def
         self.vae = load_ae(self.name, device=torch_device)

         siglip_processor = siglip_model = feature_embedder = None
@@ -113,6 +142,8 @@ class model_factory:
         n_prompt: str = None,
         sampling_steps: int = 20,
         input_ref_images = None,
+        image_guide= None,
+        image_mask= None,
         width= 832,
         height=480,
         embedded_guidance_scale: float = 2.5,
@@ -124,6 +155,7 @@ class model_factory:
         video_prompt_type = "",
         joint_pass = False,
         image_refs_relative_size = 100,
+        denoising_strength = 1.,
         **bbargs
     ):
         if self._interrupt:
@@ -132,8 +164,16 @@ class model_factory:
         if n_prompt is None or len(n_prompt) == 0: n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
         device="cuda"
         flux_dev_uso = self.name in ['flux-dev-uso']
-        image_stiching = not self.name in ['flux-dev-uso'] #and False
+        flux_dev_umo = self.name in ['flux-dev-umo']
+        latent_stiching = self.name in ['flux-dev-uso', 'flux-dev-umo']
+
+        lock_dimensions= False
+
         input_ref_images = [] if input_ref_images is None else input_ref_images[:]
+        if flux_dev_umo:
+            ref_long_side = 512 if len(input_ref_images) <= 1 else 320
+            input_ref_images = [preprocess_ref(img, ref_long_side) for img in input_ref_images]
+            lock_dimensions = True
         ref_style_imgs = []
         if "I" in video_prompt_type and len(input_ref_images) > 0:
             if flux_dev_uso :
@@ -143,22 +183,26 @@ class model_factory:
             elif len(input_ref_images) > 1 :
                 ref_style_imgs = input_ref_images[-1:]
                 input_ref_images = input_ref_images[:-1]
-            if image_stiching:
+
+            if latent_stiching:
+                # latents stiching with resize
+                if not lock_dimensions :
+                    for i in range(len(input_ref_images)):
+                        w, h = input_ref_images[i].size
+                        image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, 0)
+                        input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
+            else:
                 # image stiching method
                 stiched = input_ref_images[0]
                 for new_img in input_ref_images[1:]:
                     stiched = stitch_images(stiched, new_img)
                 input_ref_images = [stiched]
-            else:
-                # latents stiching with resize
-                for i in range(len(input_ref_images)):
-                    w, h = input_ref_images[i].size
-                    image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas)
-                    input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
+        elif image_guide is not None:
+            input_ref_images = [image_guide]
         else:
             input_ref_images = None

-        if flux_dev_uso :
+        if self.name in ['flux-dev-uso', 'flux-dev-umo'] :
             inp, height, width = prepare_multi_ip(
                 ae=self.vae,
                 img_cond_list=input_ref_images,
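
For contrast with the latent-stitching path above, the image-stitching fallback concatenates the references into one canvas. The body of stitch_images is not part of this hunk, so the following is only an illustration of the usual approach (resize img2 to img1's height, then paste side by side, matching the context comment), not WanGP's exact implementation.

```python
from PIL import Image

def stitch_side_by_side(img1: Image.Image, img2: Image.Image) -> Image.Image:
    # resize img2 to img1's height, keeping its aspect ratio
    new_w = max(1, int(img2.width * img1.height / img2.height))
    img2 = img2.resize((new_w, img1.height), resample=Image.LANCZOS)
    out = Image.new("RGB", (img1.width + img2.width, img1.height))
    out.paste(img1, (0, 0))
    out.paste(img2, (img1.width, 0))
    return out
```
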
@@ -177,6 +221,7 @@ class model_factory:
                 bs=batch_size,
                 seed=seed,
                 device=device,
+                img_mask=image_mask,
             )

         inp.update(prepare_prompt(self.t5, self.clip, batch_size, input_prompt))
@@ -198,13 +243,19 @@ class model_factory:
             return unpack(x.float(), height, width)

         # denoise initial noise
-        x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass)
+        x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass, denoising_strength = denoising_strength)
         if x==None: return None
         # decode latents to pixel space
         x = unpack_latent(x)
         with torch.autocast(device_type=device, dtype=torch.bfloat16):
             x = self.vae.decode(x)
+
+        if image_mask is not None:
+            from shared.utils.utils import convert_image_to_tensor
+            img_msk_rebuilt = inp["img_msk_rebuilt"]
+            img = convert_image_to_tensor(image_guide)
+            x = img.squeeze(2) * (1 - img_msk_rebuilt) + x.to(img) * img_msk_rebuilt
+
         x = x.clamp(-1, 1)
         x = x.transpose(0, 1)
         return x
@@ -190,6 +190,21 @@ class Flux(nn.Module):
                     v = swap_scale_shift(v)
                 k = k.replace("norm_out.linear", "final_layer.adaLN_modulation.1")
                 new_sd[k] = v
+        # elif not first_key.startswith("diffusion_model.") and not first_key.startswith("transformer."):
+        #     for k,v in sd.items():
+        #         if "double" in k:
+        #             k = k.replace(".processor.proj_lora1.", ".img_attn.proj.lora_")
+        #             k = k.replace(".processor.proj_lora2.", ".txt_attn.proj.lora_")
+        #             k = k.replace(".processor.qkv_lora1.", ".img_attn.qkv.lora_")
+        #             k = k.replace(".processor.qkv_lora2.", ".txt_attn.qkv.lora_")
+        #         else:
+        #             k = k.replace(".processor.qkv_lora.", ".linear1_qkv.lora_")
+        #             k = k.replace(".processor.proj_lora.", ".linear2.lora_")
+
+        #         k = "diffusion_model." + k
+        #         new_sd[k] = v
+        #     from mmgp import safetensors2
+        #     safetensors2.torch_write_file(new_sd, "fff.safetensors")
         else:
             new_sd = sd
         return new_sd
@@ -138,10 +138,12 @@ def prepare_kontext(
     target_width: int | None = None,
     target_height: int | None = None,
     bs: int = 1,
+    img_mask = None,
 ) -> tuple[dict[str, Tensor], int, int]:
     # load and encode the conditioning image

+    res_match_output = img_mask is not None

     img_cond_seq = None
     img_cond_seq_ids = None
     if img_cond_list == None: img_cond_list = []
@@ -150,7 +152,9 @@ def prepare_kontext(
     for cond_no, img_cond in enumerate(img_cond_list):
         width, height = img_cond.size
         aspect_ratio = width / height
+        if res_match_output:
+            width, height = target_width, target_height
+        else:
         # Kontext is trained on specific resolutions, using one of them is recommended
         _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS)
         width = 2 * int(width / 16)
@@ -193,6 +197,19 @@ def prepare_kontext(
         "img_cond_seq": img_cond_seq,
         "img_cond_seq_ids": img_cond_seq_ids,
     }
+    if img_mask is not None:
+        from shared.utils.utils import convert_image_to_tensor, convert_tensor_to_image
+        # image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of)
+        image_mask_latents = convert_image_to_tensor(img_mask.resize((target_width // 16, target_height // 16), resample=Image.Resampling.LANCZOS))
+        image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1]
+        image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0)
+        convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png")
+        image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device)
+        return_dict.update({
+            "img_msk_latents": image_mask_latents,
+            "img_msk_rebuilt": image_mask_rebuilt,
+        })
+
     img = get_noise(
         bs,
         target_height,
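
The mask handling above shrinks the user mask to the 16x16-patch latent grid, binarizes it, flattens it to match the latent sequence, and also rebuilds a full-resolution copy (img_msk_rebuilt) for the final composite. A standalone sketch with plain torch tensors (a random stand-in replaces WanGP's convert_image_to_tensor helper):

```python
import torch
import torch.nn.functional as F

H, W = 1024, 768
pixel_mask = (torch.rand(1, 1, H, W) > 0.5).float()       # stand-in for the user-drawn mask

latent_mask = F.interpolate(pixel_mask, size=(H // 16, W // 16), mode="nearest")
latent_mask = (latent_mask > 0.5).float()                  # one value per 16x16 latent patch
mask_rebuilt = latent_mask.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2)
mask_flat = latent_mask.reshape(1, -1, 1)                  # matches the flattened latent sequence

print(mask_rebuilt.shape, mask_flat.shape)                 # torch.Size([1, 1, 1024, 768]) torch.Size([1, 3072, 1])
```
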
@@ -264,6 +281,9 @@ def denoise(
     loras_slists=None,
     unpack_latent = None,
     joint_pass= False,
+    img_msk_latents = None,
+    img_msk_rebuilt = None,
+    denoising_strength = 1,
 ):

     kwargs = {'pipeline': pipeline, 'callback': callback, "img_len" : img.shape[1], "siglip_embedding": siglip_embedding, "siglip_embedding_ids": siglip_embedding_ids}
@@ -271,6 +291,21 @@ def denoise(
     if callback != None:
         callback(-1, None, True)

+    original_image_latents = None if img_cond_seq is None else img_cond_seq.clone()
+
+    morph, first_step = False, 0
+    if img_msk_latents is not None:
+        randn = torch.randn_like(original_image_latents)
+        if denoising_strength < 1.:
+            first_step = int(len(timesteps) * (1. - denoising_strength))
+            if not morph:
+                latent_noise_factor = timesteps[first_step]
+                latents = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor
+                img = latents.to(img)
+                latents = None
+            timesteps = timesteps[first_step:]
+
     updated_num_steps= len(timesteps) -1
     if callback != None:
         from shared.utils.loras_mutipliers import update_loras_slists
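
The denoising_strength plumbing above implements a partial denoise: a fraction of the schedule is skipped, and the starting latents are a blend of the clean image latents and noise at the resumed timestep. A standalone sketch of that start-up step:

```python
import torch

timesteps = torch.linspace(1.0, 0.0, 21)                        # stand-in schedule (20 steps)
denoising_strength = 0.6
first_step = int(len(timesteps) * (1.0 - denoising_strength))   # 8: the first 8 steps are skipped

clean_latents = torch.randn(1, 3072, 64)                        # stand-in for the encoded guide image
randn = torch.randn_like(clean_latents)                         # the noise tensor drawn once
t = timesteps[first_step]                                       # noise level at the resumed step
img = clean_latents * (1.0 - t) + randn * t                     # same blend as in denoise() above
timesteps = timesteps[first_step:]                              # only the remaining steps are run
```
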
@@ -280,10 +315,14 @@ def denoise(
     # this is ignored for schnell
     guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
     for i, (t_curr, t_prev) in enumerate(zip(timesteps[:-1], timesteps[1:])):
-        offload.set_step_no_for_lora(model, i)
+        offload.set_step_no_for_lora(model, first_step + i)
         if pipeline._interrupt:
             return None
+
+        if img_msk_latents is not None and denoising_strength <1. and i == first_step and morph:
+            latent_noise_factor = t_curr/1000
+            img = original_image_latents * (1.0 - latent_noise_factor) + img * latent_noise_factor
+
         t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
         img_input = img
         img_input_ids = img_ids
@@ -333,6 +372,14 @@ def denoise(
             pred = neg_pred + real_guidance_scale * (pred - neg_pred)

         img += (t_prev - t_curr) * pred
+
+        if img_msk_latents is not None:
+            latent_noise_factor = t_prev
+            # noisy_image = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor
+            noisy_image = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor
+            img = noisy_image * (1-img_msk_latents) + img_msk_latents * img
+            noisy_image = None
+
         if callback is not None:
             preview = unpack_latent(img).transpose(0,1)
             callback(i, preview, False)
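
Inside the loop, inpainting is enforced by re-noising the original latents to the current noise level (reusing the same randn drawn once up front) and pasting them back outside the mask, so only the masked region evolves freely. A standalone sketch of that blend; the model update itself is a placeholder here.

```python
import torch

seq, dim = 3072, 64
original = torch.randn(1, seq, dim)                # clean latents of the guide image
randn = torch.randn_like(original)                 # fixed noise, drawn once before the loop
mask = (torch.rand(1, seq, 1) > 0.5).float()       # 1 where the image should be repainted

img = randn.clone()
for t_prev in (0.8, 0.6, 0.4, 0.2, 0.0):           # stand-in for the remaining timesteps
    img = img - 0.1 * torch.randn_like(img)        # placeholder for the denoiser update
    noisy_original = original * (1.0 - t_prev) + randn * t_prev
    img = noisy_original * (1 - mask) + mask * img # outside the mask: pinned to the re-noised original
```
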
@@ -640,6 +640,38 @@ configs = {
             shift_factor=0.1159,
         ),
     ),
+    "flux-dev-umo": ModelSpec(
+        repo_id="",
+        repo_flow="",
+        repo_ae="ckpts/flux_vae.safetensors",
+        params=FluxParams(
+            in_channels=64,
+            out_channels=64,
+            vec_in_dim=768,
+            context_in_dim=4096,
+            hidden_size=3072,
+            mlp_ratio=4.0,
+            num_heads=24,
+            depth=19,
+            depth_single_blocks=38,
+            axes_dim=[16, 56, 56],
+            theta=10_000,
+            qkv_bias=True,
+            guidance_embed=True,
+            eso= True,
+        ),
+        ae_params=AutoEncoderParams(
+            resolution=256,
+            in_channels=3,
+            ch=128,
+            out_ch=3,
+            ch_mult=[1, 2, 4, 4],
+            num_res_blocks=2,
+            z_channels=16,
+            scale_factor=0.3611,
+            shift_factor=0.1159,
+        ),
+    ),
 }
|
|||||||
image_mask_latents = convert_image_to_tensor(image_mask.resize((width // 16, height // 16), resample=Image.Resampling.LANCZOS))
|
image_mask_latents = convert_image_to_tensor(image_mask.resize((width // 16, height // 16), resample=Image.Resampling.LANCZOS))
|
||||||
image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1]
|
image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1]
|
||||||
image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0)
|
image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0)
|
||||||
convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png")
|
# convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png")
|
||||||
image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device)
|
image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device)
|
||||||
|
|
||||||
prompt_image = image
|
prompt_image = image
|
||||||
if image.size != (image_width, image_height):
|
if image.size != (image_width, image_height):
|
||||||
image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
|
image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
|
||||||
|
|
||||||
image.save("nnn.png")
|
# image.save("nnn.png")
|
||||||
image = convert_image_to_tensor(image).unsqueeze(0).unsqueeze(2)
|
image = convert_image_to_tensor(image).unsqueeze(0).unsqueeze(2)
|
||||||
|
|
||||||
has_neg_prompt = negative_prompt is not None or (
|
has_neg_prompt = negative_prompt is not None or (
|
||||||
@@ -811,12 +811,15 @@ class QwenImagePipeline(): #DiffusionPipeline
         negative_txt_seq_lens = (
             negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None
         )
-        morph = False
-        if image_mask_latents is not None and denoising_strength <= 1.:
+        morph, first_step = False, 0
+        if image_mask_latents is not None:
+            randn = torch.randn_like(original_image_latents)
+            if denoising_strength < 1.:
                 first_step = int(len(timesteps) * (1. - denoising_strength))
                 if not morph:
                     latent_noise_factor = timesteps[first_step]/1000
-                    latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor
+                    # latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor
+                    latents = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor
                 timesteps = timesteps[first_step:]
                 self.scheduler.timesteps = timesteps
                 self.scheduler.sigmas= self.scheduler.sigmas[first_step:]
@@ -831,6 +834,7 @@ class QwenImagePipeline(): #DiffusionPipeline


         for i, t in enumerate(timesteps):
+            offload.set_step_no_for_lora(self.transformer, first_step + i)
             if self.interrupt:
                 continue

@@ -905,7 +909,8 @@ class QwenImagePipeline(): #DiffusionPipeline
                 if image_mask_latents is not None:
                     next_t = timesteps[i+1] if i<len(timesteps)-1 else 0
                     latent_noise_factor = next_t / 1000
-                    noisy_image = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor
+                    # noisy_image = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor
+                    noisy_image = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor
                     latents = noisy_image * (1-image_mask_latents) + image_mask_latents * latents
                     noisy_image = None

@@ -28,6 +28,7 @@ class family_handler():
                 ],
                 "letters_filter": "KI",
             }
+            extra_model_def["background_removal_label"] = "Remove Backgrounds only behind People / Objects except main Subject / Landscape"

         return extra_model_def

@@ -846,7 +846,7 @@ class WanAny2V:
         for i, t in enumerate(tqdm(timesteps)):
             guide_scale, guidance_switch_done, trans, denoising_extra = update_guidance(i, t, guide_scale, guide2_scale, guidance_switch_done, switch_threshold, trans, 2, denoising_extra)
             guide_scale, guidance_switch2_done, trans, denoising_extra = update_guidance(i, t, guide_scale, guide3_scale, guidance_switch2_done, switch2_threshold, trans, 3, denoising_extra)
-            offload.set_step_no_for_lora(trans, i)
+            offload.set_step_no_for_lora(trans, start_step_no + i)
             timestep = torch.stack([t])

             if timestep_injection:
@@ -165,6 +165,7 @@ class family_handler():
             }

             extra_model_def["lock_image_refs_ratios"] = True
+            extra_model_def["background_removal_label"] = "Remove Backgrounds behind People / Objects, keep it for Landscape or positioned Frames"

         if base_model_type in ["standin"]:
             extra_model_def["lock_image_refs_ratios"] = True
wgp.py (34 changed lines)
@@ -61,8 +61,8 @@ AUTOSAVE_FILENAME = "queue.zip"
 PROMPT_VARS_MAX = 10

 target_mmgp_version = "3.6.0"
-WanGP_version = "8.5"
-settings_version = 2.33
+WanGP_version = "8.55"
+settings_version = 2.34
 max_source_video_frames = 3000
 prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None

@@ -487,7 +487,6 @@ def process_prompt_and_add_tasks(state, model_choice):
         image_mask = None

     if "G" in video_prompt_type:
-        if image_mode == 0:
         gr.Info(f"With Denoising Strength {denoising_strength:.1f}, denoising will start at Step no {int(num_inference_steps * (1. - denoising_strength))} ")
     else:
         denoising_strength = 1.0
@@ -552,11 +551,13 @@ def process_prompt_and_add_tasks(state, model_choice):

     if test_any_sliding_window(model_type) and image_mode == 0:
         if video_length > sliding_window_size:
+            if model_type in ["t2v"] and not "G" in video_prompt_type :
+                gr.Info(f"You have requested to Generate Sliding Windows with a Text to Video model. Unless you use the Video to Video feature this is useless as a t2v model doesn't see past frames and it will generate the same video in each new window.")
+                return
             full_video_length = video_length if video_source is None else video_length + sliding_window_overlap -1
             extra = "" if full_video_length == video_length else f" including {sliding_window_overlap} added for Video Continuation"
             no_windows = compute_sliding_window_no(full_video_length, sliding_window_size, sliding_window_discard_last_frames, sliding_window_overlap)
             gr.Info(f"The Number of Frames to generate ({video_length}{extra}) is greater than the Sliding Window Size ({sliding_window_size}), {no_windows} Windows will be generated")

     if "recam" in model_filename:
         if video_guide == None:
             gr.Info("You must provide a Control Video")
@@ -7019,8 +7020,14 @@ def categorize_resolution(resolution_str):
             return group
     return "1440p"

-def group_resolutions(resolutions, selected_resolution):
+def group_resolutions(model_def, resolutions, selected_resolution):
+
+    model_resolutions = model_def.get("resolutions", None)
+    if model_resolutions is not None:
+        selected_group = "Locked"
+        available_groups = [selected_group]
+        selected_group_resolutions = model_resolutions
+    else:
     grouped_resolutions = {}
     for resolution in resolutions:
         group = categorize_resolution(resolution[1])
@@ -7040,7 +7047,11 @@ def change_resolution_group(state, selected_group):
     model_def = get_model_def(model_type)
     model_resolutions = model_def.get("resolutions", None)
     resolution_choices, _ = get_resolution_choices(None, model_resolutions)
+    if model_resolutions is None:
     group_resolution_choices = [ resolution for resolution in resolution_choices if categorize_resolution(resolution[1]) == selected_group ]
+    else:
+        last_resolution = group_resolution_choices[0][1]
+        return gr.update(choices= group_resolution_choices, value= last_resolution)
+
     last_resolution_per_group = state["last_resolution_per_group"]
     last_resolution = last_resolution_per_group.get(selected_group, "")
@@ -7051,6 +7062,11 @@ def change_resolution_group(state, selected_group):


 def record_last_resolution(state, resolution):
+
+    model_type = state["model_type"]
+    model_def = get_model_def(model_type)
+    model_resolutions = model_def.get("resolutions", None)
+    if model_resolutions is not None: return
     server_config["last_resolution_choice"] = resolution
     selected_group = categorize_resolution(resolution)
     last_resolution_per_group = state["last_resolution_per_group"]
@@ -7482,11 +7498,13 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                         frames_positions = gr.Text(value=ui_defaults.get("frames_positions","") , visible= "F" in video_prompt_type_value, scale = 2, label= "Positions of Injected Frames separated by Spaces (1=first, no position for Objects / People)" )
                         image_refs_relative_size = gr.Slider(20, 100, value=ui_defaults.get("image_refs_relative_size", 50), step=1, label="Rescale Internaly Image Ref (% in relation to Output Video) to change Output Composition", visible = model_def.get("any_image_refs_relative_size", False) and image_outputs)

-                        no_background_removal = model_def.get("no_background_removal", False)
+                        no_background_removal = model_def.get("no_background_removal", False) or image_ref_choices is None
+                        background_removal_label = model_def.get("background_removal_label", "Remove Backgrounds behind People / Objects")

                         remove_background_images_ref = gr.Dropdown(
                             choices=[
                                 ("Keep Backgrounds behind all Reference Images", 0),
-                                ("Remove Backgrounds only behind People / Objects except main Subject / Landscape" if (flux or qwen) else ("Remove Backgrounds behind People / Objects, keep it for Landscape or positioned Frames" if vace else "Remove Backgrounds behind People / Objects") , 1),
+                                (background_removal_label, 1),
                             ],
                             value=0 if no_background_removal else ui_defaults.get("remove_background_images_ref",1),
                             label="Automatic Removal of Background of People or Objects (Only)", scale = 3, visible= "I" in video_prompt_type_value and not no_background_removal
@@ -7578,7 +7596,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                 current_resolution_choice = ui_defaults.get("resolution","832x480") if update_form or last_resolution is None else last_resolution
                 model_resolutions = model_def.get("resolutions", None)
                 resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions)
-                available_groups, selected_group_resolutions, selected_group = group_resolutions(resolution_choices, current_resolution_choice)
+                available_groups, selected_group_resolutions, selected_group = group_resolutions(model_def, resolution_choices, current_resolution_choice)
                 resolution_group = gr.Dropdown(
                     choices = available_groups,
                     value= selected_group,