Flux Festival

This commit is contained in:
DeepBeepMeep 2025-09-11 21:23:05 +02:00
parent 119162373a
commit 9fa267087b
15 changed files with 305 additions and 50 deletions

View File

@ -20,7 +20,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models
**Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
## 🔥 Latest Updates :
### September 5 2025: WanGP v8.5 - Wanna be a Cropper or a Painter ?
### September 11 2025: WanGP v8.5/8.55 - Wanna be a Cropper or a Painter ?
I have done some intensive internal refactoring of the generation pipeline to make it easier to support existing models and add new ones. Nothing really visible, but it makes WanGP a little more future proof.
@ -38,6 +38,13 @@ Doing more sophisticated thing Vace Image Editor works very well too: try Image
For the best quality I recommend enabling, in the *Quality Tab*, the option: "*Generate a 9 Frames Long video...*"
**update 8.55**: Flux Festival
- **Inpainting Mode** also added for *Flux Kontext*
- **Flux SRPO**: a new finetune with 3x better quality than Flux Dev, according to its authors. I have also created a *Flux SRPO USO* finetune, which is certainly the best open source *Style Transfer* tool available
- **Flux UMO**: a model specialized in combining multiple reference objects / people. It works quite well at 768x768
Good luck finding your way through all the Flux model names!
### September 5 2025: WanGP v8.4 - Take me to Outer Space
You have probably seen those short AI-generated movies created using *Nano Banana* and the *First Frame - Last Frame* feature of *Kling 2.0*. The idea is to generate an image, modify a part of it with Nano Banana, and give these two images to Kling, which will generate the video between them. Then use the previous Last Frame as the new First Frame, rinse and repeat, and you get a full movie.
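If you want to script that loop yourself, here is a minimal sketch of the chaining logic. It is only an illustration: `edit_image` and `generate_clip` are hypothetical callables standing in for a Nano Banana style image editor and a First Frame - Last Frame video model; they are not WanGP APIs.

```python
from typing import Callable, List
from PIL import Image

def chain_first_last_frames(
    start_image: Image.Image,
    edit_prompts: List[str],
    edit_image: Callable[[Image.Image, str], Image.Image],        # hypothetical image editor (Nano Banana style)
    generate_clip: Callable[[Image.Image, Image.Image], object],  # hypothetical First Frame - Last Frame model
) -> List[object]:
    """The Last Frame of each generated clip becomes the First Frame of the next one."""
    clips = []
    first_frame = start_image
    for prompt in edit_prompts:
        last_frame = edit_image(first_frame, prompt)           # modify a part of the current frame
        clips.append(generate_clip(first_frame, last_frame))   # video between the two frames
        first_frame = last_frame                               # rinse and repeat
    return clips
```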

View File

@ -0,0 +1,24 @@
{
"model": {
"name": "Flux 1 Dev UMO 12B",
"architecture": "flux",
"description": "FLUX.1 Dev UMO is a model that can Edit Images with a specialization in combining multiple image references (resized internally at 512x512 max) to produce an Image output. Best Image preservation at 768x768 Resolution Output.",
"URLs": "flux",
"flux-model": "flux-dev-umo",
"loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-UMO_dit_lora_bf16.safetensors"],
"resolutions": [ ["1024x1024 (1:1)", "1024x1024"],
["768x1024 (3:4)", "768x1024"],
["1024x768 (4:3)", "1024x768"],
["512x1024 (1:2)", "512x1024"],
["1024x512 (2:1)", "1024x512"],
["768x768 (1:1)", "768x768"],
["768x512 (3:2)", "768x512"],
["512x768 (2:3)", "512x768"]]
},
"prompt": "the man is wearing a hat",
"embedded_guidance_scale": 4,
"resolution": "768x768",
"batch_size": 1
}

View File

@ -2,7 +2,7 @@
"model": {
"name": "Flux 1 Dev USO 12B",
"architecture": "flux",
"description": "FLUX.1 Dev USO is a model specialized to Edit Images with a specialization in Style Transfers (up to two).",
"description": "FLUX.1 Dev USO is a model that can Edit Images with a specialization in Style Transfers (up to two).",
"modules": [ ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_projector_bf16.safetensors"]],
"URLs": "flux",
"loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_dit_lora_bf16.safetensors"],

defaults/flux_srpo.json Normal file
View File

@ -0,0 +1,15 @@
{
"model": {
"name": "Flux 1 SRPO Dev 12B",
"architecture": "flux",
"description": "By fine-tuning the FLUX.1.dev model with optimized denoising and online reward adjustment, SRPO improves its human-evaluated realism and aesthetic quality by over 3x.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_quanto_bf16_int8.safetensors"
],
"flux-model": "flux-dev"
},
"prompt": "draw a hat",
"resolution": "1024x1024",
"batch_size": 1
}

View File

@ -0,0 +1,17 @@
{
"model": {
"name": "Flux 1 SRPO USO 12B",
"architecture": "flux",
"description": "FLUX.1 SRPO USO is a model that can Edit Images with a specialization in Style Transfers (up to two). It leverages the improved Image quality brought by the SRPO process",
"modules": [ "flux_dev_uso"],
"URLs": "flux_srpo",
"loras": "flux_dev_uso",
"flux-model": "flux-dev-uso"
},
"prompt": "the man is wearing a hat",
"embedded_guidance_scale": 4,
"resolution": "1024x1024",
"batch_size": 1
}

View File

@ -13,6 +13,7 @@ class family_handler():
flux_schnell = flux_model == "flux-schnell"
flux_chroma = flux_model == "flux-chroma"
flux_uso = flux_model == "flux-dev-uso"
flux_umo = flux_model == "flux-dev-umo"
flux_kontext = flux_model == "flux-dev-kontext"
extra_model_def = {
@ -35,6 +36,7 @@ class family_handler():
}
if flux_kontext:
extra_model_def["inpaint_support"] = True
extra_model_def["image_ref_choices"] = {
"choices": [
("None", ""),
@ -43,6 +45,15 @@ class family_handler():
],
"letters_filter": "KI",
}
extra_model_def["background_removal_label"]= "Remove Backgrounds only behind People / Objects except main Subject / Landscape"
elif flux_umo:
extra_model_def["image_ref_choices"] = {
"choices": [
("Conditional Images are People / Objects", "I"),
],
"letters_filter": "I",
"visible": False
}
extra_model_def["lock_image_refs_ratios"] = True
@ -131,10 +142,14 @@ class family_handler():
video_prompt_type = video_prompt_type.replace("I", "KI")
ui_defaults["video_prompt_type"] = video_prompt_type
if settings_version < 2.34:
ui_defaults["denoising_strength"] = 1.
@staticmethod
def update_default_settings(base_model_type, model_def, ui_defaults):
flux_model = model_def.get("flux-model", "flux-dev")
flux_uso = flux_model == "flux-dev-uso"
flux_umo = flux_model == "flux-dev-umo"
flux_kontext = flux_model == "flux-dev-kontext"
ui_defaults.update({
"embedded_guidance": 2.5,
@ -143,5 +158,12 @@ class family_handler():
if flux_kontext or flux_uso:
ui_defaults.update({
"video_prompt_type": "KI",
"denoising_strength": 1.,
})
elif flux_umo:
ui_defaults.update({
"video_prompt_type": "I",
"remove_background_images_ref": 0,
})

View File

@ -23,6 +23,35 @@ from .util import (
)
from PIL import Image
def preprocess_ref(raw_image: Image.Image, long_size: int = 512):
# Get the width and height of the original image
image_w, image_h = raw_image.size
# Work out the long and short sides
if image_w >= image_h:
new_w = long_size
new_h = int((long_size / image_w) * image_h)
else:
new_h = long_size
new_w = int((long_size / image_h) * image_w)
# Resize proportionally to the new width and height
raw_image = raw_image.resize((new_w, new_h), resample=Image.LANCZOS)
target_w = new_w // 16 * 16
target_h = new_h // 16 * 16
# Compute the starting coordinates for a center crop
left = (new_w - target_w) // 2
top = (new_h - target_h) // 2
right = left + target_w
bottom = top + target_h
# Perform the center crop
raw_image = raw_image.crop((left, top, right, bottom))
# Convert to RGB mode
raw_image = raw_image.convert("RGB")
return raw_image
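As a quick usage sketch (the file names below are hypothetical, and the 512 / 320 choice mirrors the UMO branch of generate() further down):
# Usage sketch only; "ref_a.png" and "ref_b.png" are hypothetical file names.
refs = [Image.open("ref_a.png"), Image.open("ref_b.png")]
ref_long_side = 512 if len(refs) <= 1 else 320   # 512 for a single reference, 320 when combining several
refs = [preprocess_ref(img, ref_long_side) for img in refs]
# Each result is proportionally resized, center-cropped to multiples of 16, and converted to RGB.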
def stitch_images(img1, img2):
# Resize img2 to match img1's height
@ -67,7 +96,7 @@ class model_factory:
# self.name= "flux-schnell"
source = model_def.get("source", None)
self.model = load_flow_model(self.name, model_filename[0] if source is None else source, torch_device)
self.model_def = model_def
self.vae = load_ae(self.name, device=torch_device)
siglip_processor = siglip_model = feature_embedder = None
@ -109,10 +138,12 @@ class model_factory:
def generate(
self,
seed: int | None = None,
input_prompt: str = "replace the logo with the text 'Black Forest Labs'",
input_prompt: str = "replace the logo with the text 'Black Forest Labs'",
n_prompt: str = None,
sampling_steps: int = 20,
input_ref_images = None,
image_guide= None,
image_mask= None,
width= 832,
height=480,
embedded_guidance_scale: float = 2.5,
@ -123,7 +154,8 @@ class model_factory:
batch_size = 1,
video_prompt_type = "",
joint_pass = False,
image_refs_relative_size = 100,
image_refs_relative_size = 100,
denoising_strength = 1.,
**bbargs
):
if self._interrupt:
@ -132,8 +164,16 @@ class model_factory:
if n_prompt is None or len(n_prompt) == 0: n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
device="cuda"
flux_dev_uso = self.name in ['flux-dev-uso']
image_stiching = not self.name in ['flux-dev-uso'] #and False
flux_dev_umo = self.name in ['flux-dev-umo']
latent_stiching = self.name in ['flux-dev-uso', 'flux-dev-umo']
lock_dimensions= False
input_ref_images = [] if input_ref_images is None else input_ref_images[:]
if flux_dev_umo:
ref_long_side = 512 if len(input_ref_images) <= 1 else 320
input_ref_images = [preprocess_ref(img, ref_long_side) for img in input_ref_images]
lock_dimensions = True
ref_style_imgs = []
if "I" in video_prompt_type and len(input_ref_images) > 0:
if flux_dev_uso :
@ -143,22 +183,26 @@ class model_factory:
elif len(input_ref_images) > 1 :
ref_style_imgs = input_ref_images[-1:]
input_ref_images = input_ref_images[:-1]
if image_stiching:
if latent_stiching:
# latent stitching with resize
if not lock_dimensions :
for i in range(len(input_ref_images)):
w, h = input_ref_images[i].size
image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, 0)
input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
else:
# image stitching method
stiched = input_ref_images[0]
for new_img in input_ref_images[1:]:
stiched = stitch_images(stiched, new_img)
input_ref_images = [stiched]
else:
# latent stitching with resize
for i in range(len(input_ref_images)):
w, h = input_ref_images[i].size
image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas)
input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
elif image_guide is not None:
input_ref_images = [image_guide]
else:
input_ref_images = None
if flux_dev_uso :
if self.name in ['flux-dev-uso', 'flux-dev-umo'] :
inp, height, width = prepare_multi_ip(
ae=self.vae,
img_cond_list=input_ref_images,
@ -177,6 +221,7 @@ class model_factory:
bs=batch_size,
seed=seed,
device=device,
img_mask=image_mask,
)
inp.update(prepare_prompt(self.t5, self.clip, batch_size, input_prompt))
@ -198,13 +243,19 @@ class model_factory:
return unpack(x.float(), height, width)
# denoise initial noise
x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass)
x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass, denoising_strength = denoising_strength)
if x is None: return None
# decode latents to pixel space
x = unpack_latent(x)
with torch.autocast(device_type=device, dtype=torch.bfloat16):
x = self.vae.decode(x)
if image_mask is not None:
from shared.utils.utils import convert_image_to_tensor
img_msk_rebuilt = inp["img_msk_rebuilt"]
img= convert_image_to_tensor(image_guide)
x = img.squeeze(2) * (1 - img_msk_rebuilt) + x.to(img) * img_msk_rebuilt
x = x.clamp(-1, 1)
x = x.transpose(0, 1)
return x

View File

@ -190,6 +190,21 @@ class Flux(nn.Module):
v = swap_scale_shift(v)
k = k.replace("norm_out.linear", "final_layer.adaLN_modulation.1")
new_sd[k] = v
# elif not first_key.startswith("diffusion_model.") and not first_key.startswith("transformer."):
# for k,v in sd.items():
# if "double" in k:
# k = k.replace(".processor.proj_lora1.", ".img_attn.proj.lora_")
# k = k.replace(".processor.proj_lora2.", ".txt_attn.proj.lora_")
# k = k.replace(".processor.qkv_lora1.", ".img_attn.qkv.lora_")
# k = k.replace(".processor.qkv_lora2.", ".txt_attn.qkv.lora_")
# else:
# k = k.replace(".processor.qkv_lora.", ".linear1_qkv.lora_")
# k = k.replace(".processor.proj_lora.", ".linear2.lora_")
# k = "diffusion_model." + k
# new_sd[k] = v
# from mmgp import safetensors2
# safetensors2.torch_write_file(new_sd, "fff.safetensors")
else:
new_sd = sd
return new_sd

View File

@ -138,10 +138,12 @@ def prepare_kontext(
target_width: int | None = None,
target_height: int | None = None,
bs: int = 1,
img_mask = None,
) -> tuple[dict[str, Tensor], int, int]:
# load and encode the conditioning image
res_match_output = img_mask is not None
img_cond_seq = None
img_cond_seq_ids = None
if img_cond_list == None: img_cond_list = []
@ -150,9 +152,11 @@ def prepare_kontext(
for cond_no, img_cond in enumerate(img_cond_list):
width, height = img_cond.size
aspect_ratio = width / height
# Kontext is trained on specific resolutions, using one of them is recommended
_, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS)
if res_match_output:
width, height = target_width, target_height
else:
# Kontext is trained on specific resolutions, using one of them is recommended
_, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS)
width = 2 * int(width / 16)
height = 2 * int(height / 16)
@ -193,6 +197,19 @@ def prepare_kontext(
"img_cond_seq": img_cond_seq,
"img_cond_seq_ids": img_cond_seq_ids,
}
if img_mask is not None:
from shared.utils.utils import convert_image_to_tensor, convert_tensor_to_image
# image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of)
image_mask_latents = convert_image_to_tensor(img_mask.resize((target_width // 16, target_height // 16), resample=Image.Resampling.LANCZOS))
image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1]
image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0)
# convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png")
image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device)
return_dict.update({
"img_msk_latents": image_mask_latents,
"img_msk_rebuilt": image_mask_rebuilt,
})
img = get_noise(
bs,
target_height,
@ -264,6 +281,9 @@ def denoise(
loras_slists=None,
unpack_latent = None,
joint_pass= False,
img_msk_latents = None,
img_msk_rebuilt = None,
denoising_strength = 1,
):
kwargs = {'pipeline': pipeline, 'callback': callback, "img_len" : img.shape[1], "siglip_embedding": siglip_embedding, "siglip_embedding_ids": siglip_embedding_ids}
@ -271,6 +291,21 @@ def denoise(
if callback != None:
callback(-1, None, True)
original_image_latents = None if img_cond_seq is None else img_cond_seq.clone()
morph, first_step = False, 0
if img_msk_latents is not None:
randn = torch.randn_like(original_image_latents)
if denoising_strength < 1.:
first_step = int(len(timesteps) * (1. - denoising_strength))
if not morph:
latent_noise_factor = timesteps[first_step]
latents = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor
img = latents.to(img)
latents = None
timesteps = timesteps[first_step:]
updated_num_steps= len(timesteps) -1
if callback != None:
from shared.utils.loras_mutipliers import update_loras_slists
@ -280,10 +315,14 @@ def denoise(
# this is ignored for schnell
guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
for i, (t_curr, t_prev) in enumerate(zip(timesteps[:-1], timesteps[1:])):
offload.set_step_no_for_lora(model, i)
offload.set_step_no_for_lora(model, first_step + i)
if pipeline._interrupt:
return None
if img_msk_latents is not None and denoising_strength <1. and i == first_step and morph:
latent_noise_factor = t_curr/1000
img = original_image_latents * (1.0 - latent_noise_factor) + img * latent_noise_factor
t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
img_input = img
img_input_ids = img_ids
@ -333,6 +372,14 @@ def denoise(
pred = neg_pred + real_guidance_scale * (pred - neg_pred)
img += (t_prev - t_curr) * pred
if img_msk_latents is not None:
latent_noise_factor = t_prev
# noisy_image = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor
noisy_image = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor
img = noisy_image * (1-img_msk_latents) + img_msk_latents * img
noisy_image = None
if callback is not None:
preview = unpack_latent(img).transpose(0,1)
callback(i, preview, False)
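To make the new denoising_strength path concrete, here is a small worked sketch of the step-skipping arithmetic; the schedule and the 0.6 value are illustrative assumptions, not WanGP defaults, and the commented formulas simply restate the blending done above.
# Illustrative arithmetic for the partial-denoising logic above (example values only)
timesteps = [1.0 - i / 20 for i in range(21)]                  # stand-in schedule with 21 values
denoising_strength = 0.6
first_step = int(len(timesteps) * (1.0 - denoising_strength))  # int(21 * 0.4) = 8 skipped steps
timesteps = timesteps[first_step:]                             # denoising starts at t = 0.6
# start latent:  img = original_image_latents * (1 - t) + randn * t
# each step:     img = noisy_image * (1 - img_msk_latents) + img_msk_latents * img
print(first_step, len(timesteps) - 1)                          # -> 8 12 (12 denoising steps remain)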

View File

@ -640,6 +640,38 @@ configs = {
shift_factor=0.1159,
),
),
"flux-dev-umo": ModelSpec(
repo_id="",
repo_flow="",
repo_ae="ckpts/flux_vae.safetensors",
params=FluxParams(
in_channels=64,
out_channels=64,
vec_in_dim=768,
context_in_dim=4096,
hidden_size=3072,
mlp_ratio=4.0,
num_heads=24,
depth=19,
depth_single_blocks=38,
axes_dim=[16, 56, 56],
theta=10_000,
qkv_bias=True,
guidance_embed=True,
eso= True,
),
ae_params=AutoEncoderParams(
resolution=256,
in_channels=3,
ch=128,
out_ch=3,
ch_mult=[1, 2, 4, 4],
num_res_blocks=2,
z_channels=16,
scale_factor=0.3611,
shift_factor=0.1159,
),
),
}

View File

@ -714,14 +714,14 @@ class QwenImagePipeline(): #DiffusionPipeline
image_mask_latents = convert_image_to_tensor(image_mask.resize((width // 16, height // 16), resample=Image.Resampling.LANCZOS))
image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1]
image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0)
convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png")
# convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png")
image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device)
prompt_image = image
if image.size != (image_width, image_height):
image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
image.save("nnn.png")
# image.save("nnn.png")
image = convert_image_to_tensor(image).unsqueeze(0).unsqueeze(2)
has_neg_prompt = negative_prompt is not None or (
@ -811,12 +811,15 @@ class QwenImagePipeline(): #DiffusionPipeline
negative_txt_seq_lens = (
negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None
)
morph = False
if image_mask_latents is not None and denoising_strength <= 1.:
first_step = int(len(timesteps) * (1. - denoising_strength))
morph, first_step = False, 0
if image_mask_latents is not None:
randn = torch.randn_like(original_image_latents)
if denoising_strength < 1.:
first_step = int(len(timesteps) * (1. - denoising_strength))
if not morph:
latent_noise_factor = timesteps[first_step]/1000
latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor
# latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor
latents = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor
timesteps = timesteps[first_step:]
self.scheduler.timesteps = timesteps
self.scheduler.sigmas= self.scheduler.sigmas[first_step:]
@ -831,6 +834,7 @@ class QwenImagePipeline(): #DiffusionPipeline
for i, t in enumerate(timesteps):
offload.set_step_no_for_lora(self.transformer, first_step + i)
if self.interrupt:
continue
@ -905,7 +909,8 @@ class QwenImagePipeline(): #DiffusionPipeline
if image_mask_latents is not None:
next_t = timesteps[i+1] if i<len(timesteps)-1 else 0
latent_noise_factor = next_t / 1000
noisy_image = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor
# noisy_image = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor
noisy_image = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor
latents = noisy_image * (1-image_mask_latents) + image_mask_latents * latents
noisy_image = None

View File

@ -28,6 +28,7 @@ class family_handler():
],
"letters_filter": "KI",
}
extra_model_def["background_removal_label"]= "Remove Backgrounds only behind People / Objects except main Subject / Landscape"
return extra_model_def

View File

@ -846,7 +846,7 @@ class WanAny2V:
for i, t in enumerate(tqdm(timesteps)):
guide_scale, guidance_switch_done, trans, denoising_extra = update_guidance(i, t, guide_scale, guide2_scale, guidance_switch_done, switch_threshold, trans, 2, denoising_extra)
guide_scale, guidance_switch2_done, trans, denoising_extra = update_guidance(i, t, guide_scale, guide3_scale, guidance_switch2_done, switch2_threshold, trans, 3, denoising_extra)
offload.set_step_no_for_lora(trans, i)
offload.set_step_no_for_lora(trans, start_step_no + i)
timestep = torch.stack([t])
if timestep_injection:

View File

@ -165,6 +165,7 @@ class family_handler():
}
extra_model_def["lock_image_refs_ratios"] = True
extra_model_def["background_removal_label"]= "Remove Backgrounds behind People / Objects, keep it for Landscape or positioned Frames"
if base_model_type in ["standin"]:
extra_model_def["lock_image_refs_ratios"] = True

wgp.py
View File

@ -61,8 +61,8 @@ AUTOSAVE_FILENAME = "queue.zip"
PROMPT_VARS_MAX = 10
target_mmgp_version = "3.6.0"
WanGP_version = "8.5"
settings_version = 2.33
WanGP_version = "8.55"
settings_version = 2.34
max_source_video_frames = 3000
prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
@ -487,7 +487,6 @@ def process_prompt_and_add_tasks(state, model_choice):
image_mask = None
if "G" in video_prompt_type:
if image_mode == 0:
gr.Info(f"With Denoising Strength {denoising_strength:.1f}, denoising will start at Step no {int(num_inference_steps * (1. - denoising_strength))} ")
else:
denoising_strength = 1.0
@ -552,11 +551,13 @@ def process_prompt_and_add_tasks(state, model_choice):
if test_any_sliding_window(model_type) and image_mode == 0:
if video_length > sliding_window_size:
if model_type in ["t2v"] and not "G" in video_prompt_type :
gr.Info(f"You have requested to Generate Sliding Windows with a Text to Video model. Unless you use the Video to Video feature this is useless as a t2v model doesn't see past frames and it will generate the same video in each new window.")
return
full_video_length = video_length if video_source is None else video_length + sliding_window_overlap -1
extra = "" if full_video_length == video_length else f" including {sliding_window_overlap} added for Video Continuation"
no_windows = compute_sliding_window_no(full_video_length, sliding_window_size, sliding_window_discard_last_frames, sliding_window_overlap)
gr.Info(f"The Number of Frames to generate ({video_length}{extra}) is greater than the Sliding Window Size ({sliding_window_size}), {no_windows} Windows will be generated")
if "recam" in model_filename:
if video_guide == None:
gr.Info("You must provide a Control Video")
@ -7019,28 +7020,38 @@ def categorize_resolution(resolution_str):
return group
return "1440p"
def group_resolutions(resolutions, selected_resolution):
def group_resolutions(model_def, resolutions, selected_resolution):
model_resolutions = model_def.get("resolutions", None)
if model_resolutions is not None:
selected_group ="Locked"
available_groups = [selected_group ]
selected_group_resolutions = model_resolutions
else:
grouped_resolutions = {}
for resolution in resolutions:
group = categorize_resolution(resolution[1])
if group not in grouped_resolutions:
grouped_resolutions[group] = []
grouped_resolutions[group].append(resolution)
available_groups = [group for group in group_thresholds if group in grouped_resolutions]
grouped_resolutions = {}
for resolution in resolutions:
group = categorize_resolution(resolution[1])
if group not in grouped_resolutions:
grouped_resolutions[group] = []
grouped_resolutions[group].append(resolution)
available_groups = [group for group in group_thresholds if group in grouped_resolutions]
selected_group = categorize_resolution(selected_resolution)
selected_group_resolutions = grouped_resolutions.get(selected_group, [])
available_groups.reverse()
selected_group = categorize_resolution(selected_resolution)
selected_group_resolutions = grouped_resolutions.get(selected_group, [])
available_groups.reverse()
return available_groups, selected_group_resolutions, selected_group
def change_resolution_group(state, selected_group):
model_type = state["model_type"]
model_def = get_model_def(model_type)
model_resolutions = model_def.get("resolutions", None)
resolution_choices, _ = get_resolution_choices(None, model_resolutions)
group_resolution_choices = [ resolution for resolution in resolution_choices if categorize_resolution(resolution[1]) == selected_group ]
resolution_choices, _ = get_resolution_choices(None, model_resolutions)
if model_resolutions is None:
group_resolution_choices = [ resolution for resolution in resolution_choices if categorize_resolution(resolution[1]) == selected_group ]
else:
last_resolution = group_resolution_choices[0][1]
return gr.update(choices= group_resolution_choices, value= last_resolution)
last_resolution_per_group = state["last_resolution_per_group"]
last_resolution = last_resolution_per_group.get(selected_group, "")
@ -7051,6 +7062,11 @@ def change_resolution_group(state, selected_group):
def record_last_resolution(state, resolution):
model_type = state["model_type"]
model_def = get_model_def(model_type)
model_resolutions = model_def.get("resolutions", None)
if model_resolutions is not None: return
server_config["last_resolution_choice"] = resolution
selected_group = categorize_resolution(resolution)
last_resolution_per_group = state["last_resolution_per_group"]
@ -7482,11 +7498,13 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
frames_positions = gr.Text(value=ui_defaults.get("frames_positions","") , visible= "F" in video_prompt_type_value, scale = 2, label= "Positions of Injected Frames separated by Spaces (1=first, no position for Objects / People)" )
image_refs_relative_size = gr.Slider(20, 100, value=ui_defaults.get("image_refs_relative_size", 50), step=1, label="Rescale Image Refs Internally (% in relation to Output Video) to change Output Composition", visible = model_def.get("any_image_refs_relative_size", False) and image_outputs)
no_background_removal = model_def.get("no_background_removal", False)
no_background_removal = model_def.get("no_background_removal", False) or image_ref_choices is None
background_removal_label = model_def.get("background_removal_label", "Remove Backgrounds behind People / Objects")
remove_background_images_ref = gr.Dropdown(
choices=[
("Keep Backgrounds behind all Reference Images", 0),
("Remove Backgrounds only behind People / Objects except main Subject / Landscape" if (flux or qwen) else ("Remove Backgrounds behind People / Objects, keep it for Landscape or positioned Frames" if vace else "Remove Backgrounds behind People / Objects") , 1),
(background_removal_label, 1),
],
value=0 if no_background_removal else ui_defaults.get("remove_background_images_ref",1),
label="Automatic Removal of Background of People or Objects (Only)", scale = 3, visible= "I" in video_prompt_type_value and not no_background_removal
@ -7578,7 +7596,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
current_resolution_choice = ui_defaults.get("resolution","832x480") if update_form or last_resolution is None else last_resolution
model_resolutions = model_def.get("resolutions", None)
resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions)
available_groups, selected_group_resolutions, selected_group = group_resolutions(resolution_choices, current_resolution_choice)
available_groups, selected_group_resolutions, selected_group = group_resolutions(model_def, resolution_choices, current_resolution_choice)
resolution_group = gr.Dropdown(
choices = available_groups,
value= selected_group,