From f9f63cbc79c364ac146d833292f0ab89e199228c Mon Sep 17 00:00:00 2001 From: DeepBeepMeep Date: Tue, 9 Sep 2025 21:41:35 +0200 Subject: [PATCH 1/3] intermediate commit --- defaults/flux_dev_kontext.json | 2 - defaults/flux_dev_uso.json | 2 - defaults/qwen_image_edit_20B.json | 4 +- models/flux/flux_handler.py | 37 +- models/flux/flux_main.py | 69 +-- models/hyvideo/hunyuan.py | 5 - models/hyvideo/hunyuan_handler.py | 21 + models/ltx_video/ltxv.py | 3 - models/ltx_video/ltxv_handler.py | 9 + models/qwen/pipeline_qwenimage.py | 65 ++- models/qwen/qwen_handler.py | 20 +- models/qwen/qwen_main.py | 10 +- models/wan/any2video.py | 41 +- models/wan/df_handler.py | 2 +- models/wan/wan_handler.py | 74 ++- preprocessing/matanyone/app.py | 69 ++- requirements.txt | 2 +- shared/gradio/gallery.py | 141 +++-- shared/utils/utils.py | 53 +- wgp.py | 839 +++++++++++++++++------------- 20 files changed, 897 insertions(+), 571 deletions(-) diff --git a/defaults/flux_dev_kontext.json b/defaults/flux_dev_kontext.json index 8945918..20b6bc4 100644 --- a/defaults/flux_dev_kontext.json +++ b/defaults/flux_dev_kontext.json @@ -7,8 +7,6 @@ "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1_kontext_dev_bf16.safetensors", "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1_kontext_dev_quanto_bf16_int8.safetensors" ], - "image_outputs": true, - "reference_image": true, "flux-model": "flux-dev-kontext" }, "prompt": "add a hat", diff --git a/defaults/flux_dev_uso.json b/defaults/flux_dev_uso.json index 8b5dbb6..0cd7b82 100644 --- a/defaults/flux_dev_uso.json +++ b/defaults/flux_dev_uso.json @@ -6,8 +6,6 @@ "modules": [ ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_projector_bf16.safetensors"]], "URLs": "flux", "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_dit_lora_bf16.safetensors"], - "image_outputs": true, - "reference_image": true, "flux-model": "flux-dev-uso" }, "prompt": "the man is wearing a hat", diff --git a/defaults/qwen_image_edit_20B.json b/defaults/qwen_image_edit_20B.json index 2b24c72..79b8b24 100644 --- a/defaults/qwen_image_edit_20B.json +++ b/defaults/qwen_image_edit_20B.json @@ -9,9 +9,7 @@ ], "attention": { "<89": "sdpa" - }, - "reference_image": true, - "image_outputs": true + } }, "prompt": "add a hat", "resolution": "1280x720", diff --git a/models/flux/flux_handler.py b/models/flux/flux_handler.py index b8b7b9b..c468d5a 100644 --- a/models/flux/flux_handler.py +++ b/models/flux/flux_handler.py @@ -13,28 +13,41 @@ class family_handler(): flux_schnell = flux_model == "flux-schnell" flux_chroma = flux_model == "flux-chroma" flux_uso = flux_model == "flux-dev-uso" - model_def_output = { + flux_kontext = flux_model == "flux-dev-kontext" + + extra_model_def = { "image_outputs" : True, "no_negative_prompt" : not flux_chroma, } if flux_chroma: - model_def_output["guidance_max_phases"] = 1 + extra_model_def["guidance_max_phases"] = 1 elif not flux_schnell: - model_def_output["embedded_guidance"] = True + extra_model_def["embedded_guidance"] = True if flux_uso : - model_def_output["any_image_refs_relative_size"] = True - model_def_output["no_background_removal"] = True - - model_def_output["image_ref_choices"] = { + extra_model_def["any_image_refs_relative_size"] = True + extra_model_def["no_background_removal"] = True + extra_model_def["image_ref_choices"] = { "choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "KI"), ("Up to two Images are Style Images", 
"KIJ")], "default": "KI", "letters_filter": "KIJ", "label": "Reference Images / Style Images" } - model_def_output["lock_image_refs_ratios"] = True + + if flux_kontext: + extra_model_def["image_ref_choices"] = { + "choices": [ + ("None", ""), + ("Conditional Images is first Main Subject / Landscape and may be followed by People / Objects", "KI"), + ("Conditional Images are People / Objects", "I"), + ], + "letters_filter": "KI", + } - return model_def_output + + extra_model_def["lock_image_refs_ratios"] = True + + return extra_model_def @staticmethod def query_supported_types(): @@ -122,10 +135,12 @@ class family_handler(): def update_default_settings(base_model_type, model_def, ui_defaults): flux_model = model_def.get("flux-model", "flux-dev") flux_uso = flux_model == "flux-dev-uso" + flux_kontext = flux_model == "flux-dev-kontext" ui_defaults.update({ "embedded_guidance": 2.5, - }) - if model_def.get("reference_image", False): + }) + + if flux_kontext or flux_uso: ui_defaults.update({ "video_prompt_type": "KI", }) diff --git a/models/flux/flux_main.py b/models/flux/flux_main.py index 9bb8e73..4d7c67d 100644 --- a/models/flux/flux_main.py +++ b/models/flux/flux_main.py @@ -24,44 +24,6 @@ from .util import ( from PIL import Image -def resize_and_centercrop_image(image, target_height_ref1, target_width_ref1): - target_height_ref1 = int(target_height_ref1 // 64 * 64) - target_width_ref1 = int(target_width_ref1 // 64 * 64) - h, w = image.shape[-2:] - if h < target_height_ref1 or w < target_width_ref1: - # 计算长宽比 - aspect_ratio = w / h - if h < target_height_ref1: - new_h = target_height_ref1 - new_w = new_h * aspect_ratio - if new_w < target_width_ref1: - new_w = target_width_ref1 - new_h = new_w / aspect_ratio - else: - new_w = target_width_ref1 - new_h = new_w / aspect_ratio - if new_h < target_height_ref1: - new_h = target_height_ref1 - new_w = new_h * aspect_ratio - else: - aspect_ratio = w / h - tgt_aspect_ratio = target_width_ref1 / target_height_ref1 - if aspect_ratio > tgt_aspect_ratio: - new_h = target_height_ref1 - new_w = new_h * aspect_ratio - else: - new_w = target_width_ref1 - new_h = new_w / aspect_ratio - # 使用 TVF.resize 进行图像缩放 - image = TVF.resize(image, (math.ceil(new_h), math.ceil(new_w))) - # 计算中心裁剪的参数 - top = (image.shape[-2] - target_height_ref1) // 2 - left = (image.shape[-1] - target_width_ref1) // 2 - # 使用 TVF.crop 进行中心裁剪 - image = TVF.crop(image, top, left, target_height_ref1, target_width_ref1) - return image - - def stitch_images(img1, img2): # Resize img2 to match img1's height width1, height1 = img1.size @@ -171,8 +133,6 @@ class model_factory: device="cuda" flux_dev_uso = self.name in ['flux-dev-uso'] image_stiching = not self.name in ['flux-dev-uso'] #and False - # image_refs_relative_size = 100 - crop = False input_ref_images = [] if input_ref_images is None else input_ref_images[:] ref_style_imgs = [] if "I" in video_prompt_type and len(input_ref_images) > 0: @@ -186,36 +146,15 @@ class model_factory: if image_stiching: # image stiching method stiched = input_ref_images[0] - if "K" in video_prompt_type : - w, h = input_ref_images[0].size - height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas) - # actual rescale will happen in prepare_kontext for new_img in input_ref_images[1:]: stiched = stitch_images(stiched, new_img) input_ref_images = [stiched] else: - first_ref = 0 - if "K" in video_prompt_type: - # image latents tiling method - w, h = input_ref_images[0].size - if crop : - img = convert_image_to_tensor(input_ref_images[0]) - img = 
resize_and_centercrop_image(img, height, width) - input_ref_images[0] = convert_tensor_to_image(img) - else: - height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas) - input_ref_images[0] = input_ref_images[0].resize((width, height), resample=Image.Resampling.LANCZOS) - first_ref = 1 - - for i in range(first_ref,len(input_ref_images)): + # latents stiching with resize + for i in range(len(input_ref_images)): w, h = input_ref_images[i].size - if crop: - img = convert_image_to_tensor(input_ref_images[i]) - img = resize_and_centercrop_image(img, int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100)) - input_ref_images[i] = convert_tensor_to_image(img) - else: - image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas) - input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas) + input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) else: input_ref_images = None diff --git a/models/hyvideo/hunyuan.py b/models/hyvideo/hunyuan.py index a38a7bd..181a9a7 100644 --- a/models/hyvideo/hunyuan.py +++ b/models/hyvideo/hunyuan.py @@ -861,11 +861,6 @@ class HunyuanVideoSampler(Inference): freqs_cos, freqs_sin = self.get_rotary_pos_embed(target_frame_num, target_height, target_width, enable_RIFLEx) else: if self.avatar: - w, h = input_ref_images.size - target_height, target_width = calculate_new_dimensions(target_height, target_width, h, w, fit_into_canvas) - if target_width != w or target_height != h: - input_ref_images = input_ref_images.resize((target_width,target_height), resample=Image.Resampling.LANCZOS) - concat_dict = {'mode': 'timecat', 'bias': -1} freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(129, target_height, target_width, concat_dict) else: diff --git a/models/hyvideo/hunyuan_handler.py b/models/hyvideo/hunyuan_handler.py index 9cbaea7..487e76d 100644 --- a/models/hyvideo/hunyuan_handler.py +++ b/models/hyvideo/hunyuan_handler.py @@ -51,6 +51,23 @@ class family_handler(): extra_model_def["tea_cache"] = True extra_model_def["mag_cache"] = True + if base_model_type in ["hunyuan_custom_edit"]: + extra_model_def["guide_preprocessing"] = { + "selection": ["MV", "PMV"], + } + + extra_model_def["mask_preprocessing"] = { + "selection": ["A", "NA"], + "default" : "NA" + } + + if base_model_type in ["hunyuan_custom_audio", "hunyuan_custom_edit", "hunyuan_custom"]: + extra_model_def["image_ref_choices"] = { + "choices": [("Reference Image", "I")], + "letters_filter":"I", + "visible": False, + } + if base_model_type in ["hunyuan_avatar"]: extra_model_def["no_background_removal"] = True if base_model_type in ["hunyuan_custom", "hunyuan_custom_edit", "hunyuan_custom_audio", "hunyuan_avatar"]: @@ -141,6 +158,10 @@ class family_handler(): return hunyuan_model, pipe + @staticmethod + def fix_settings(base_model_type, settings_version, model_def, ui_defaults): + pass + @staticmethod def update_default_settings(base_model_type, model_def, ui_defaults): ui_defaults["embedded_guidance_scale"]= 6.0 diff --git a/models/ltx_video/ltxv.py b/models/ltx_video/ltxv.py index e71ac4f..db143fc 100644 --- a/models/ltx_video/ltxv.py +++ b/models/ltx_video/ltxv.py @@ -300,9 +300,6 @@ class LTXV: 
prefix_size, height, width = input_video.shape[-3:] else: if image_start != None: - frame_width, frame_height = image_start.size - if fit_into_canvas != None: - height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas, 32) conditioning_media_paths.append(image_start.unsqueeze(1)) conditioning_start_frames.append(0) conditioning_control_frames.append(False) diff --git a/models/ltx_video/ltxv_handler.py b/models/ltx_video/ltxv_handler.py index c89b69a..e44c983 100644 --- a/models/ltx_video/ltxv_handler.py +++ b/models/ltx_video/ltxv_handler.py @@ -26,6 +26,15 @@ class family_handler(): extra_model_def["sliding_window"] = True extra_model_def["image_prompt_types_allowed"] = "TSEV" + extra_model_def["guide_preprocessing"] = { + "selection": ["", "PV", "DV", "EV", "V"], + "labels" : { "V": "Use LTXV raw format"} + } + + extra_model_def["mask_preprocessing"] = { + "selection": ["", "A", "NA", "XA", "XNA"], + } + return extra_model_def @staticmethod diff --git a/models/qwen/pipeline_qwenimage.py b/models/qwen/pipeline_qwenimage.py index 07bdbd4..905b864 100644 --- a/models/qwen/pipeline_qwenimage.py +++ b/models/qwen/pipeline_qwenimage.py @@ -28,7 +28,7 @@ from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Aut from .autoencoder_kl_qwenimage import AutoencoderKLQwenImage from diffusers import FlowMatchEulerDiscreteScheduler from PIL import Image -from shared.utils.utils import calculate_new_dimensions +from shared.utils.utils import calculate_new_dimensions, convert_image_to_tensor, convert_tensor_to_image XLA_AVAILABLE = False @@ -563,6 +563,8 @@ class QwenImagePipeline(): #DiffusionPipeline callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 512, image = None, + image_mask = None, + denoising_strength = 0, callback=None, pipeline=None, loras_slists=None, @@ -694,14 +696,33 @@ class QwenImagePipeline(): #DiffusionPipeline image_width = image_width // multiple_of * multiple_of image_height = image_height // multiple_of * multiple_of ref_height, ref_width = 1568, 672 - if height * width < ref_height * ref_width: ref_height , ref_width = height , width - if image_height * image_width > ref_height * ref_width: - image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) - image = image.resize((image_width,image_height), resample=Image.Resampling.LANCZOS) + if image_mask is None: + if height * width < ref_height * ref_width: ref_height , ref_width = height , width + if image_height * image_width > ref_height * ref_width: + image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) + if (image_width,image_height) != image.size: + image = image.resize((image_width,image_height), resample=Image.Resampling.LANCZOS) + image_mask_latents = None + else: + # _, image_width, image_height = min( + # (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS + # ) + image_height, image_width = calculate_new_dimensions(height, width, image_height, image_width, False, block_size=multiple_of) + # image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) + height, width = image_height, image_width + image_mask_latents = convert_image_to_tensor(image_mask.resize((width // 16, height // 16), resample=Image.Resampling.LANCZOS)) + image_mask_latents = 
torch.where(image_mask_latents>-0.5, 1., 0. )[0:1] + image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0) + convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") + image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device) + prompt_image = image - image = self.image_processor.preprocess(image, image_height, image_width) - image = image.unsqueeze(2) + if image.size != (image_width, image_height): + image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + + image.save("nnn.png") + image = convert_image_to_tensor(image).unsqueeze(0).unsqueeze(2) has_neg_prompt = negative_prompt is not None or ( negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None @@ -744,6 +765,8 @@ class QwenImagePipeline(): #DiffusionPipeline generator, latents, ) + original_image_latents = None if image_latents is None else image_latents.clone() + if image is not None: img_shapes = [ [ @@ -788,6 +811,15 @@ class QwenImagePipeline(): #DiffusionPipeline negative_txt_seq_lens = ( negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None ) + morph = False + if image_mask_latents is not None and denoising_strength <= 1.: + first_step = int(len(timesteps) * (1. - denoising_strength)) + if not morph: + latent_noise_factor = timesteps[first_step]/1000 + latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + timesteps = timesteps[first_step:] + self.scheduler.timesteps = timesteps + self.scheduler.sigmas= self.scheduler.sigmas[first_step:] # 6. Denoising loop self.scheduler.set_begin_index(0) @@ -797,10 +829,15 @@ class QwenImagePipeline(): #DiffusionPipeline update_loras_slists(self.transformer, loras_slists, updated_num_steps) callback(-1, None, True, override_num_inference_steps = updated_num_steps) + for i, t in enumerate(timesteps): if self.interrupt: continue + if image_mask_latents is not None and denoising_strength <1. 
and i == first_step and morph: + latent_noise_factor = t/1000 + latents = original_image_latents * (1.0 - latent_noise_factor) + latents * latent_noise_factor + self._current_timestep = t # broadcast to batch dimension in a way that's compatible with ONNX/Core ML timestep = t.expand(latents.shape[0]).to(latents.dtype) @@ -865,6 +902,12 @@ class QwenImagePipeline(): #DiffusionPipeline # compute the previous noisy sample x_t -> x_t-1 latents_dtype = latents.dtype latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + if image_mask_latents is not None: + next_t = timesteps[i+1] if i append at end insert_pos = min(sel, len(cur) -1) cur_clean = cur # Build final list and selection @@ -330,6 +329,8 @@ class AdvancedMediaGallery: st["items"] = merged st["selected"] = new_sel + record_last_action(st,"add") + # print(f"gallery add: set sel {new_sel}") return gr.update(value=merged, selected_index=new_sel), st def _on_remove(self, state: Dict[str, Any], gallery) : @@ -342,8 +343,9 @@ class AdvancedMediaGallery: return gr.update(value=[], selected_index=None), st new_sel = min(sel, len(items) - 1) st["items"] = items; st["selected"] = new_sel - # return gr.update(value=items, selected_index=new_sel), st - return gr.update(value=items), st + record_last_action(st,"remove") + # print(f"gallery del: new sel {new_sel}") + return gr.update(value=items, selected_index=new_sel), st def _on_move(self, delta: int, state: Dict[str, Any], gallery) : st = get_state(state); items: List[Any] = get_list(gallery); sel = st.get("selected", None) @@ -354,11 +356,15 @@ class AdvancedMediaGallery: return gr.update(value=items, selected_index=sel), st items[sel], items[j] = items[j], items[sel] st["items"] = items; st["selected"] = j + record_last_action(st,"move") + # print(f"gallery move: set sel {j}") return gr.update(value=items, selected_index=j), st def _on_clear(self, state: Dict[str, Any]) : st = {"items": [], "selected": None, "single": get_state(state).get("single", False), "mode": self.media_mode} - return gr.update(value=[], selected_index=0), st + record_last_action(st,"clear") + # print(f"Clear all") + return gr.update(value=[], selected_index=None), st def _on_toggle_single(self, to_single: bool, state: Dict[str, Any]) : st = get_state(state); st["single"] = bool(to_single) @@ -382,30 +388,38 @@ class AdvancedMediaGallery: def mount(self, parent: Optional[gr.Blocks | gr.Group | gr.Row | gr.Column] = None, update_form = False): if parent is not None: with parent: - col = self._build_ui() + col = self._build_ui(update_form) else: - col = self._build_ui() + col = self._build_ui(update_form) if not update_form: self._wire_events() return col - def _build_ui(self) -> gr.Column: + def _build_ui(self, update = False) -> gr.Column: with gr.Column(elem_id=self.elem_id, elem_classes=self.elem_classes) as col: self.container = col self.state = gr.State(dict(self._initial_state)) - self.gallery = gr.Gallery( - label=self.label, - value=self._initial_state["items"], - height=self.height, - columns=self.columns, - show_label=self.show_label, - preview= True, - # type="pil", - file_types= list(IMAGE_EXTS) if self.media_mode == "image" else list(VIDEO_EXTS), - selected_index=self._initial_state["selected"], # server-side selection - ) + if update: + self.gallery = gr.update( + value=self._initial_state["items"], + selected_index=self._initial_state["selected"], # server-side selection + label=self.label, + show_label=self.show_label, + ) + else: + self.gallery = gr.Gallery( + 
value=self._initial_state["items"], + label=self.label, + height=self.height, + columns=self.columns, + show_label=self.show_label, + preview= True, + # type="pil", # very slow + file_types= list(IMAGE_EXTS) if self.media_mode == "image" else list(VIDEO_EXTS), + selected_index=self._initial_state["selected"], # server-side selection + ) # One-line controls exts = sorted(IMAGE_EXTS if self.media_mode == "image" else VIDEO_EXTS) if self.accept_filter else None @@ -418,10 +432,10 @@ class AdvancedMediaGallery: size="sm", min_width=1, ) - self.btn_remove = gr.Button("Remove", size="sm", min_width=1) + self.btn_remove = gr.Button(" Remove ", size="sm", min_width=1) self.btn_left = gr.Button("◀ Left", size="sm", visible=not self._initial_state["single"], min_width=1) self.btn_right = gr.Button("Right ▶", size="sm", visible=not self._initial_state["single"], min_width=1) - self.btn_clear = gr.Button("Clear", variant="secondary", size="sm", visible=not self._initial_state["single"], min_width=1) + self.btn_clear = gr.Button(" Clear ", variant="secondary", size="sm", visible=not self._initial_state["single"], min_width=1) return col @@ -430,14 +444,24 @@ class AdvancedMediaGallery: self.gallery.select( self._on_select, inputs=[self.state, self.gallery], - outputs=[self.state], + outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # Gallery value changed by user actions (click-to-add, drag-drop, internal remove, etc.) - self.gallery.change( + self.gallery.upload( + self._on_upload, + inputs=[self.gallery, self.state], + outputs=[self.gallery, self.state], + trigger_mode="always_last", + ) + + # Gallery value changed by user actions (click-to-add, drag-drop, internal remove, etc.) + self.gallery.upload( self._on_gallery_change, inputs=[self.gallery, self.state], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # Add via UploadButton @@ -445,6 +469,7 @@ class AdvancedMediaGallery: self._on_add, inputs=[self.upload_btn, self.state, self.gallery], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # Remove selected @@ -452,6 +477,7 @@ class AdvancedMediaGallery: self._on_remove, inputs=[self.state, self.gallery], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # Reorder using selected index, keep same item selected @@ -459,11 +485,13 @@ class AdvancedMediaGallery: lambda st, gallery: self._on_move(-1, st, gallery), inputs=[self.state, self.gallery], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) self.btn_right.click( lambda st, gallery: self._on_move(+1, st, gallery), inputs=[self.state, self.gallery], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # Clear all @@ -471,6 +499,7 @@ class AdvancedMediaGallery: self._on_clear, inputs=[self.state], outputs=[self.gallery, self.state], + trigger_mode="always_last", ) # ---------------- public API ---------------- diff --git a/shared/utils/utils.py b/shared/utils/utils.py index 7ddf1eb..5dbd0af 100644 --- a/shared/utils/utils.py +++ b/shared/utils/utils.py @@ -19,6 +19,7 @@ import tempfile import subprocess import json from functools import lru_cache +os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg") from PIL import Image @@ -207,30 +208,62 @@ def get_outpainting_frame_location(final_height, final_width, outpainting_dims if (margin_left + width) > final_width or outpainting_right == 0: margin_left = final_width - width return height, width, margin_top, margin_left -def calculate_new_dimensions(canvas_height, canvas_width, 
image_height, image_width, fit_into_canvas, block_size = 16): - if fit_into_canvas == None: +def rescale_and_crop(img, w, h): + ow, oh = img.size + target_ratio = w / h + orig_ratio = ow / oh + + if orig_ratio > target_ratio: + # Crop width first + nw = int(oh * target_ratio) + img = img.crop(((ow - nw) // 2, 0, (ow + nw) // 2, oh)) + else: + # Crop height first + nh = int(ow / target_ratio) + img = img.crop((0, (oh - nh) // 2, ow, (oh + nh) // 2)) + + return img.resize((w, h), Image.LANCZOS) + +def calculate_new_dimensions(canvas_height, canvas_width, image_height, image_width, fit_into_canvas, block_size = 16): + if fit_into_canvas == None or fit_into_canvas == 2: # return image_height, image_width return canvas_height, canvas_width - if fit_into_canvas: + if fit_into_canvas == 1: scale1 = min(canvas_height / image_height, canvas_width / image_width) scale2 = min(canvas_width / image_height, canvas_height / image_width) scale = max(scale1, scale2) - else: + else: #0 or #2 (crop) scale = (canvas_height * canvas_width / (image_height * image_width))**(1/2) new_height = round( image_height * scale / block_size) * block_size new_width = round( image_width * scale / block_size) * block_size return new_height, new_width -def resize_and_remove_background(img_list, budget_width, budget_height, rm_background, ignore_first, fit_into_canvas = False ): +def calculate_dimensions_and_resize_image(image, canvas_height, canvas_width, fit_into_canvas, fit_crop, block_size = 16): + if fit_crop: + image = rescale_and_crop(image, canvas_width, canvas_height) + new_width, new_height = image.size + else: + image_width, image_height = image.size + new_height, new_width = calculate_new_dimensions(canvas_height, canvas_width, image_height, image_width, fit_into_canvas, block_size = block_size ) + image = image.resize((new_width, new_height), resample=Image.Resampling.LANCZOS) + return image, new_height, new_width + +def resize_and_remove_background(img_list, budget_width, budget_height, rm_background, any_background_ref, fit_into_canvas = 0, block_size= 16, outpainting_dims = None ): if rm_background: session = new_session() output_list =[] for i, img in enumerate(img_list): width, height = img.size - - if fit_into_canvas: + if fit_into_canvas == None or any_background_ref == 1 and i==0 or any_background_ref == 2: + if outpainting_dims is not None: + resized_image =img + elif img.size != (budget_width, budget_height): + resized_image= img.resize((budget_width, budget_height), resample=Image.Resampling.LANCZOS) + else: + resized_image =img + elif fit_into_canvas == 1: white_canvas = np.ones((budget_height, budget_width, 3), dtype=np.uint8) * 255 scale = min(budget_height / height, budget_width / width) new_height = int(height * scale) @@ -242,10 +275,10 @@ def resize_and_remove_background(img_list, budget_width, budget_height, rm_backg resized_image = Image.fromarray(white_canvas) else: scale = (budget_height * budget_width / (height * width))**(1/2) - new_height = int( round(height * scale / 16) * 16) - new_width = int( round(width * scale / 16) * 16) + new_height = int( round(height * scale / block_size) * block_size) + new_width = int( round(width * scale / block_size) * block_size) resized_image= img.resize((new_width,new_height), resample=Image.Resampling.LANCZOS) - if rm_background and not (ignore_first and i == 0) : + if rm_background and not (any_background_ref and i==0 or any_background_ref == 2) : # resized_image = remove(resized_image, session=session, alpha_matting_erode_size = 
1,alpha_matting_background_threshold = 70, alpha_foreground_background_threshold = 100, alpha_matting = True, bgcolor=[255, 255, 255, 0]).convert('RGB') resized_image = remove(resized_image, session=session, alpha_matting_erode_size = 1, alpha_matting = True, bgcolor=[255, 255, 255, 0]).convert('RGB') output_list.append(resized_image) #alpha_matting_background_threshold = 30, alpha_foreground_background_threshold = 200, diff --git a/wgp.py b/wgp.py index 396e273..45cb5cf 100644 --- a/wgp.py +++ b/wgp.py @@ -1,4 +1,5 @@ import os +os.environ["GRADIO_LANG"] = "en" # # os.environ.pop("TORCH_LOGS", None) # make sure no env var is suppressing/overriding # os.environ["TORCH_LOGS"]= "recompiles" import torch._logging as tlog @@ -21,7 +22,7 @@ import numpy as np import importlib from shared.utils import notification_sound from shared.utils.loras_mutipliers import preparse_loras_multipliers, parse_loras_multipliers -from shared.utils.utils import convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video, calculate_new_dimensions, convert_image_to_tensor, get_video_frame +from shared.utils.utils import convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video, calculate_new_dimensions, convert_image_to_tensor, calculate_dimensions_and_resize_image, rescale_and_crop, get_video_frame, resize_and_remove_background from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, combine_and_concatenate_video_with_audio_tracks, cleanup_temp_audio_files, save_video, save_image from shared.utils.audio_video import save_image_metadata, read_image_metadata from shared.match_archi import match_nvidia_architecture @@ -61,7 +62,7 @@ PROMPT_VARS_MAX = 10 target_mmgp_version = "3.6.0" WanGP_version = "8.4" -settings_version = 2.31 +settings_version = 2.33 max_source_video_frames = 3000 prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None @@ -220,7 +221,7 @@ def process_prompt_and_add_tasks(state, model_choice): return get_queue_table(queue) model_def = get_model_def(model_type) model_handler = get_model_handler(model_type) - image_outputs = inputs["image_mode"] == 1 + image_outputs = inputs["image_mode"] > 0 any_steps_skipping = model_def.get("tea_cache", False) or model_def.get("mag_cache", False) model_type = get_base_model_type(model_type) inputs["model_filename"] = model_filename @@ -370,7 +371,7 @@ def process_prompt_and_add_tasks(state, model_choice): gr.Info("Mag Cache maximum number of steps is 50") return - if image_mode == 1: + if image_mode > 0: audio_prompt_type = "" if "B" in audio_prompt_type or "X" in audio_prompt_type: @@ -477,7 +478,8 @@ def process_prompt_and_add_tasks(state, model_choice): image_mask = None if "G" in video_prompt_type: - gr.Info(f"With Denoising Strength {denoising_strength:.1f}, denoising will start at Step no {int(num_inference_steps * (1. - denoising_strength))} ") + if image_mode == 0: + gr.Info(f"With Denoising Strength {denoising_strength:.1f}, denoising will start at Step no {int(num_inference_steps * (1. 
- denoising_strength))} ") else: denoising_strength = 1.0 if len(keep_frames_video_guide) > 0 and model_type in ["ltxv_13B"]: @@ -1334,9 +1336,11 @@ def update_queue_data(queue): data = get_queue_table(queue) if len(data) == 0: - return gr.DataFrame(visible=False) + return gr.DataFrame(value=[], max_height=1) + elif len(data) == 1: + return gr.DataFrame(value=data, max_height= 83) else: - return gr.DataFrame(value=data, visible= True) + return gr.DataFrame(value=data, max_height= 1000) def create_html_progress_bar(percentage=0.0, text="Idle", is_idle=True): bar_class = "progress-bar-custom idle" if is_idle else "progress-bar-custom" @@ -1371,6 +1375,12 @@ def _parse_args(): help="save proprocessed audio track with extract speakers for debugging or editing" ) + parser.add_argument( + "--debug-gen-form", + action="store_true", + help="View form generation / refresh time" + ) + parser.add_argument( "--vram-safety-coefficient", type=float, @@ -2070,8 +2080,6 @@ def fix_settings(model_type, ui_defaults): if image_prompt_type != None : if not isinstance(image_prompt_type, str): image_prompt_type = "S" if image_prompt_type == 0 else "SE" - # if model_type == "flf2v_720p" and not "E" in image_prompt_type: - # image_prompt_type = "SE" if settings_version <= 2: image_prompt_type = image_prompt_type.replace("G","") ui_defaults["image_prompt_type"] = image_prompt_type @@ -2091,10 +2099,7 @@ def fix_settings(model_type, ui_defaults): video_prompt_type = ui_defaults.get("video_prompt_type", "") - any_reference_image = model_def.get("reference_image", False) - if base_model_type in ["hunyuan_custom", "hunyuan_custom_edit", "hunyuan_custom_audio", "hunyuan_avatar", "phantom_14B", "phantom_1.3B"] or any_reference_image: - if not "I" in video_prompt_type: # workaround for settings corruption - video_prompt_type += "I" + if base_model_type in ["hunyuan"]: video_prompt_type = video_prompt_type.replace("I", "") @@ -2133,10 +2138,27 @@ def fix_settings(model_type, ui_defaults): del ui_defaults["tea_cache_start_step_perc"] ui_defaults["skip_steps_start_step_perc"] = tea_cache_start_step_perc + image_prompt_type = ui_defaults.get("image_prompt_type", "") + if len(image_prompt_type) > 0: + image_prompt_types_allowed = model_def.get("image_prompt_types_allowed","") + image_prompt_type = filter_letters(image_prompt_type, image_prompt_types_allowed) + ui_defaults["image_prompt_type"] = image_prompt_type + + video_prompt_type = ui_defaults.get("video_prompt_type", "") + image_ref_choices_list = model_def.get("image_ref_choices", {}).get("choices", []) + if len(image_ref_choices_list)==0: + video_prompt_type = del_in_sequence(video_prompt_type, "IK") + else: + first_choice = image_ref_choices_list[0][1] + if "I" in first_choice and not "I" in video_prompt_type: video_prompt_type += "I" + if len(image_ref_choices_list)==1 and "K" in first_choice and not "K" in video_prompt_type: video_prompt_type += "K" + ui_defaults["video_prompt_type"] = video_prompt_type + model_handler = get_model_handler(base_model_type) if hasattr(model_handler, "fix_settings"): model_handler.fix_settings(base_model_type, settings_version, model_def, ui_defaults) + def get_default_settings(model_type): def get_default_prompt(i2v): if i2v: @@ -3323,8 +3345,8 @@ def select_video(state, input_file_list, event_data: gr.EventData): if not all_letters(src, pos): return False if neg is not None and any_letters(src, neg): return False return True - image_outputs = configs.get("image_mode",0) == 1 - map_video_prompt = {"V" : "Control Video", ("VA", "U") : 
"Mask Video", "I" : "Reference Images"} + image_outputs = configs.get("image_mode",0) > 0 + map_video_prompt = {"V" : "Control Image" if image_outputs else "Control Video", ("VA", "U") : "Mask Image" if image_outputs else "Mask Video", "I" : "Reference Images"} map_image_prompt = {"V" : "Source Video", "L" : "Last Video", "S" : "Start Image", "E" : "End Image"} map_audio_prompt = {"A" : "Audio Source", "B" : "Audio Source #2"} video_other_prompts = [ v for s,v in map_image_prompt.items() if all_letters(video_image_prompt_type,s)] \ @@ -3571,9 +3593,27 @@ def process_images_multithread(image_processor, items, process_type, wrap_in_lis end_time = time.time() # print(f"duration:{end_time-start_time:.1f}") - return results + return results -def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, max_frames, start_frame=0, fit_canvas = False, target_fps = 16, block_size= 16, expand_scale = 2, process_type = "inpaint", process_type2 = None, to_bbox = False, RGB_Mask = False, negate_mask = False, process_outside_mask = None, inpaint_color = 127, outpainting_dims = None, proc_no = 1): +def preprocess_image_with_mask(input_image, input_mask, height, width, fit_canvas = False, block_size= 16, expand_scale = 2): + frame_width, frame_height = input_image.size + + if fit_canvas != None: + height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas = fit_canvas, block_size = block_size) + input_image = input_image.resize((width, height), resample=Image.Resampling.LANCZOS) + if input_mask is not None: + input_mask = input_mask.resize((width, height), resample=Image.Resampling.LANCZOS) + + if expand_scale != 0 and input_mask is not None: + kernel_size = abs(expand_scale) + kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size)) + op_expand = cv2.dilate if expand_scale > 0 else cv2.erode + input_mask = np.array(input_mask) + input_mask = op_expand(input_mask, kernel, iterations=3) + input_mask = Image.fromarray(input_mask) + return input_image, input_mask + +def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, max_frames, start_frame=0, fit_canvas = None, fit_crop = False, target_fps = 16, block_size= 16, expand_scale = 2, process_type = "inpaint", process_type2 = None, to_bbox = False, RGB_Mask = False, negate_mask = False, process_outside_mask = None, inpaint_color = 127, outpainting_dims = None, proc_no = 1): from shared.utils.utils import calculate_new_dimensions, get_outpainting_frame_location, get_outpainting_full_area_dimensions def mask_to_xyxy_box(mask): @@ -3615,6 +3655,9 @@ def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, if len(video) == 0 or any_mask and len(mask_video) == 0: return None, None + if fit_crop and outpainting_dims != None: + fit_crop = False + fit_canvas = 0 if fit_canvas is not None else None frame_height, frame_width, _ = video[0].shape @@ -3629,7 +3672,7 @@ def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, if outpainting_dims != None: final_height, final_width = height, width - height, width, margin_top, margin_left = get_outpainting_frame_location(final_height, final_width, outpainting_dims, 8) + height, width, margin_top, margin_left = get_outpainting_frame_location(final_height, final_width, outpainting_dims, 1) if any_mask: num_frames = min(len(video), len(mask_video)) @@ -3646,14 +3689,20 @@ def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, # for frame_idx in 
range(num_frames): def prep_prephase(frame_idx): frame = Image.fromarray(video[frame_idx].cpu().numpy()) #.asnumpy() - frame = frame.resize((width, height), resample=Image.Resampling.LANCZOS) + if fit_crop: + frame = rescale_and_crop(frame, width, height) + else: + frame = frame.resize((width, height), resample=Image.Resampling.LANCZOS) frame = np.array(frame) if any_mask: if any_identity_mask: mask = np.full( (height, width, 3), 0, dtype= np.uint8) else: mask = Image.fromarray(mask_video[frame_idx].cpu().numpy()) #.asnumpy() - mask = mask.resize((width, height), resample=Image.Resampling.LANCZOS) + if fit_crop: + mask = rescale_and_crop(mask, width, height) + else: + mask = mask.resize((width, height), resample=Image.Resampling.LANCZOS) mask = np.array(mask) if len(mask.shape) == 3 and mask.shape[2] == 3: @@ -3750,14 +3799,14 @@ def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, return torch.stack(masked_frames), torch.stack(masks) if any_mask else None -def preprocess_video(height, width, video_in, max_frames, start_frame=0, fit_canvas = None, target_fps = 16, block_size = 16): +def preprocess_video(height, width, video_in, max_frames, start_frame=0, fit_canvas = None, fit_crop = False, target_fps = 16, block_size = 16): frames_list = get_resampled_video(video_in, start_frame, max_frames, target_fps) if len(frames_list) == 0: return None - if fit_canvas == None: + if fit_canvas == None or fit_crop: new_height = height new_width = width else: @@ -3775,7 +3824,10 @@ def preprocess_video(height, width, video_in, max_frames, start_frame=0, fit_can processed_frames_list = [] for frame in frames_list: frame = Image.fromarray(np.clip(frame.cpu().numpy(), 0, 255).astype(np.uint8)) - frame = frame.resize((new_width,new_height), resample=Image.Resampling.LANCZOS) + if fit_crop: + frame = rescale_and_crop(frame, new_width, new_height) + else: + frame = frame.resize((new_width,new_height), resample=Image.Resampling.LANCZOS) processed_frames_list.append(frame) np_frames = [np.array(frame) for frame in processed_frames_list] @@ -4115,9 +4167,10 @@ def process_prompt_enhancer(prompt_enhancer, original_prompts, image_start, ori prompt_images = [] if "I" in prompt_enhancer: if image_start != None: - prompt_images.append(image_start) + prompt_images += image_start if original_image_refs != None: - prompt_images += original_image_refs[:1] + prompt_images += original_image_refs[:1] + prompt_images = [Image.open(img) if isinstance(img,str) else img for img in prompt_images] if len(original_prompts) == 0 and not "T" in prompt_enhancer: return None else: @@ -4223,7 +4276,7 @@ def enhance_prompt(state, prompt, prompt_enhancer, multi_images_gen_type, overri original_image_refs = inputs["image_refs"] if original_image_refs is not None: original_image_refs = [ convert_image(tup[0]) for tup in original_image_refs ] - is_image = inputs["image_mode"] == 1 + is_image = inputs["image_mode"] > 0 seed = inputs["seed"] seed = set_seed(seed) enhanced_prompts = [] @@ -4367,7 +4420,7 @@ def generate_video( model_def = get_model_def(model_type) - is_image = image_mode == 1 + is_image = image_mode > 0 if is_image: if min_frames_if_references >= 1000: video_length = min_frames_if_references - 1000 @@ -4377,19 +4430,22 @@ def generate_video( batch_size = 1 temp_filenames_list = [] - if image_guide is not None and isinstance(image_guide, Image.Image): - video_guide = convert_image_to_video(image_guide) - temp_filenames_list.append(video_guide) - image_guide = None + convert_image_guide_to_video = 
model_def.get("convert_image_guide_to_video", False) + if convert_image_guide_to_video: + if image_guide is not None and isinstance(image_guide, Image.Image): + video_guide = convert_image_to_video(image_guide) + temp_filenames_list.append(video_guide) + image_guide = None - if image_mask is not None and isinstance(image_mask, Image.Image): - video_mask = convert_image_to_video(image_mask) - temp_filenames_list.append(video_mask) - image_mask = None + if image_mask is not None and isinstance(image_mask, Image.Image): + video_mask = convert_image_to_video(image_mask) + temp_filenames_list.append(video_mask) + image_mask = None + if model_def.get("no_background_removal", False): remove_background_images_ref = 0 + base_model_type = get_base_model_type(model_type) model_family = get_model_family(base_model_type) - fit_canvas = server_config.get("fit_canvas", 0) model_handler = get_model_handler(base_model_type) block_size = model_handler.get_vae_block_size(base_model_type) if hasattr(model_handler, "get_vae_block_size") else 16 @@ -4415,7 +4471,7 @@ def generate_video( return width, height = resolution.split("x") - width, height = int(width), int(height) + width, height = int(width) // block_size * block_size, int(height) // block_size * block_size default_image_size = (height, width) if slg_switch == 0: @@ -4530,39 +4586,25 @@ def generate_video( original_image_refs = image_refs # image_refs = None # nb_frames_positions= 0 - frames_to_inject = [] - any_background_ref = False - outpainting_dims = None if video_guide_outpainting== None or len(video_guide_outpainting) == 0 or video_guide_outpainting == "0 0 0 0" or video_guide_outpainting.startswith("#") else [int(v) for v in video_guide_outpainting.split(" ")] # Output Video Ratio Priorities: # Source Video or Start Image > Control Video > Image Ref (background or positioned frames only) > UI Width, Height # Image Ref (non background and non positioned frames) are boxed in a white canvas in order to keep their own width/height ratio - - if image_refs is not None and len(image_refs) > 0: + frames_to_inject = [] + if image_refs is not None: frames_positions_list = [ int(pos)-1 for pos in frames_positions.split(" ")] if frames_positions is not None and len(frames_positions)> 0 else [] frames_positions_list = frames_positions_list[:len(image_refs)] nb_frames_positions = len(frames_positions_list) - if nb_frames_positions > 0: - frames_to_inject = [None] * (max(frames_positions_list) + 1) - for i, pos in enumerate(frames_positions_list): - frames_to_inject[pos] = image_refs[i] - if video_guide == None and video_source == None and not "L" in image_prompt_type and (nb_frames_positions > 0 or "K" in video_prompt_type) : - from shared.utils.utils import get_outpainting_full_area_dimensions - w, h = image_refs[0].size - if outpainting_dims != None: - h, w = get_outpainting_full_area_dimensions(h,w, outpainting_dims) - default_image_size = calculate_new_dimensions(height, width, h, w, fit_canvas) - fit_canvas = None - # if there is a source video and a background image ref, the height/width ratio will need to be processed later by the code for the model (we dont know the source video dimensions at this point) - if len(image_refs) > nb_frames_positions: - any_background_ref = "K" in video_prompt_type - if remove_background_images_ref > 0: - send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")]) - os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg") - from shared.utils.utils import 
resize_and_remove_background - # keep image ratios if there is a background image ref (we will let the model preprocessor decide what to do) but remove bg if requested - image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= not (any_background_ref or model_def.get("lock_image_refs_ratios", False)) ) # no fit for vace ref images as it is done later - update_task_thumbnails(task, locals()) - send_cmd("output") + any_background_ref = 0 + if "K" in video_prompt_type: + any_background_ref = 2 if model_def.get("all_image_refs_are_background_ref", False) else 1 + + outpainting_dims = None if video_guide_outpainting== None or len(video_guide_outpainting) == 0 or video_guide_outpainting == "0 0 0 0" or video_guide_outpainting.startswith("#") else [int(v) for v in video_guide_outpainting.split(" ")] + fit_canvas = server_config.get("fit_canvas", 0) + fit_crop = fit_canvas == 2 + if fit_crop and outpainting_dims is not None: + fit_crop = False + fit_canvas = 0 + joint_pass = boost ==1 #and profile != 1 and profile != 3 skip_steps_cache = None if len(skip_steps_cache_type) == 0 else DynamicClass(cache_type = skip_steps_cache_type) @@ -4632,6 +4674,9 @@ def generate_video( length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) current_video_length = min(current_video_length, length) + if image_guide is not None: + image_guide, image_mask = preprocess_image_with_mask(image_guide, image_mask, height, width, fit_canvas = None, block_size= block_size, expand_scale = mask_expand) + seed = set_seed(seed) torch.set_grad_enabled(False) @@ -4723,22 +4768,22 @@ def generate_video( return_latent_slice = None if reuse_frames > 0: return_latent_slice = slice(-(reuse_frames - 1 + discard_last_frames ) // latent_size - 1, None if discard_last_frames == 0 else -(discard_last_frames // latent_size) ) - refresh_preview = {"image_guide" : None, "image_mask" : None} + refresh_preview = {"image_guide" : image_guide, "image_mask" : image_mask} if image_mode >= 1 else {} src_ref_images = image_refs image_start_tensor = image_end_tensor = None if window_no == 1 and (video_source is not None or image_start is not None): if image_start is not None: - new_height, new_width = calculate_new_dimensions(height, width, image_start.height, image_start.width, sample_fit_canvas, block_size = block_size) - image_start_tensor = image_start.resize((new_width, new_height), resample=Image.Resampling.LANCZOS) + image_start_tensor, new_height, new_width = calculate_dimensions_and_resize_image(image_start, height, width, sample_fit_canvas, fit_crop, block_size = block_size) + if fit_crop: refresh_preview["image_start"] = image_start_tensor image_start_tensor = convert_image_to_tensor(image_start_tensor) pre_video_guide = prefix_video = image_start_tensor.unsqueeze(1) else: - if "L" in image_prompt_type: - refresh_preview["video_source"] = get_video_frame(video_source, 0) - prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, target_fps = fps, block_size = block_size ) + prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, block_size = block_size ) prefix_video = prefix_video.permute(3, 0, 1, 2) prefix_video = 
prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w + if fit_crop or "L" in image_prompt_type: refresh_preview["video_source"] = convert_tensor_to_image(prefix_video, 0) + new_height, new_width = prefix_video.shape[-2:] pre_video_guide = prefix_video[:, -reuse_frames:] pre_video_frame = convert_tensor_to_image(prefix_video[:, -1]) @@ -4752,10 +4797,11 @@ def generate_video( image_end_list= image_end if isinstance(image_end, list) else [image_end] if len(image_end_list) >= window_no: new_height, new_width = image_size - image_end_tensor =image_end_list[window_no-1].resize((new_width, new_height), resample=Image.Resampling.LANCZOS) + image_end_tensor, _, _ = calculate_dimensions_and_resize_image(image_end_list[window_no-1], new_height, new_width, sample_fit_canvas, fit_crop, block_size = block_size) + # image_end_tensor =image_end_list[window_no-1].resize((new_width, new_height), resample=Image.Resampling.LANCZOS) + refresh_preview["image_end"] = image_end_tensor image_end_tensor = convert_image_to_tensor(image_end_tensor) image_end_list= None - window_start_frame = guide_start_frame - (reuse_frames if window_no > 1 else source_video_overlap_frames_count) guide_end_frame = guide_start_frame + current_video_length - (source_video_overlap_frames_count if window_no == 1 else reuse_frames) alignment_shift = source_video_frames_count if reset_control_aligment else 0 @@ -4768,7 +4814,8 @@ def generate_video( from models.wan.multitalk.multitalk import get_window_audio_embeddings # special treatment for start frame pos when alignement to first frame requested as otherwise the start frame number will be negative due to overlapped frames (has been previously compensated later with padding) audio_proj_split = get_window_audio_embeddings(audio_proj_full, audio_start_idx= aligned_window_start_frame + (source_video_overlap_frames_count if reset_control_aligment else 0 ), clip_length = current_video_length) - + if vace: + video_guide_processed = video_mask_processed = video_guide_processed2 = video_mask_processed2 = None if video_guide is not None: keep_frames_parsed, error = parse_keep_frames_video_guide(keep_frames_video_guide, source_video_frames_count -source_video_overlap_frames_count + requested_frames_to_generate) @@ -4776,12 +4823,44 @@ def generate_video( raise gr.Error(f"invalid keep frames {keep_frames_video_guide}") keep_frames_parsed = keep_frames_parsed[aligned_guide_start_frame: aligned_guide_end_frame ] - if ltxv: + if vace: + context_scale = [ control_net_weight] + if "V" in video_prompt_type: + process_outside_mask = process_map_outside_mask.get(filter_letters(video_prompt_type, "YWX"), None) + preprocess_type, preprocess_type2 = "raw", None + for process_num, process_letter in enumerate( filter_letters(video_prompt_type, "PDSLCMU")): + if process_num == 0: + preprocess_type = process_map_video_guide.get(process_letter, "raw") + else: + preprocess_type2 = process_map_video_guide.get(process_letter, None) + status_info = "Extracting " + processes_names[preprocess_type] + extra_process_list = ([] if preprocess_type2==None else [preprocess_type2]) + ([] if process_outside_mask==None or process_outside_mask == preprocess_type else [process_outside_mask]) + if len(extra_process_list) == 1: + status_info += " and " + processes_names[extra_process_list[0]] + elif len(extra_process_list) == 2: + status_info += ", " + processes_names[extra_process_list[0]] + " and " + processes_names[extra_process_list[1]] + if preprocess_type2 is not None: + context_scale = [ control_net_weight /2, 
control_net_weight2 /2] + send_cmd("progress", [0, get_latest_status(state, status_info)]) + video_guide_processed, video_mask_processed = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) , start_frame = aligned_guide_start_frame, fit_canvas = sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, process_type = preprocess_type, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =1 ) + if preprocess_type2 != None: + video_guide_processed2, video_mask_processed2 = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed), start_frame = aligned_guide_start_frame, fit_canvas = sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, process_type = preprocess_type2, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =2 ) + + if video_guide_processed != None: + if sample_fit_canvas != None: + image_size = video_guide_processed.shape[-3: -1] + sample_fit_canvas = None + refresh_preview["video_guide"] = Image.fromarray(video_guide_processed[0].cpu().numpy()) + if video_guide_processed2 != None: + refresh_preview["video_guide"] = [refresh_preview["video_guide"], Image.fromarray(video_guide_processed2[0].cpu().numpy())] + if video_mask_processed != None: + refresh_preview["video_mask"] = Image.fromarray(video_mask_processed[0].cpu().numpy()) + elif ltxv: preprocess_type = process_map_video_guide.get(filter_letters(video_prompt_type, "PED"), "raw") status_info = "Extracting " + processes_names[preprocess_type] send_cmd("progress", [0, get_latest_status(state, status_info)]) # start one frame ealier to facilitate latents merging later - src_video, _ = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) + (0 if aligned_guide_start_frame == 0 else 1), start_frame = aligned_guide_start_frame - (0 if aligned_guide_start_frame == 0 else 1), fit_canvas = sample_fit_canvas, target_fps = fps, process_type = preprocess_type, inpaint_color = 0, proc_no =1, negate_mask = "N" in video_prompt_type, process_outside_mask = "inpaint" if "X" in video_prompt_type else "identity", block_size =block_size ) + src_video, _ = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) + (0 if aligned_guide_start_frame == 0 else 1), start_frame = aligned_guide_start_frame - (0 if aligned_guide_start_frame == 0 else 1), fit_canvas = sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, process_type = preprocess_type, inpaint_color = 0, proc_no =1, negate_mask = "N" in video_prompt_type, process_outside_mask = "inpaint" if "X" in video_prompt_type else "identity", block_size =block_size ) if src_video != None: src_video = src_video[ :(len(src_video)-1)// latent_size * latent_size +1 ] refresh_preview["video_guide"] = Image.fromarray(src_video[0].cpu().numpy()) @@ -4798,15 +4877,14 @@ def generate_video( progress_args = [0, get_latest_status(state,"Extracting Video and Mask")] send_cmd("progress", progress_args) - src_video, src_mask = preprocess_video_with_mask(video_guide, video_mask, height=height, width = width, max_frames= current_video_length if window_no == 1 
else current_video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas = sample_fit_canvas, target_fps = fps, process_type= "pose" if "P" in video_prompt_type else "inpaint", negate_mask = "N" in video_prompt_type, inpaint_color =0) + src_video, src_mask = preprocess_video_with_mask(video_guide, video_mask, height=height, width = width, max_frames= current_video_length if window_no == 1 else current_video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas = sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, process_type= "pose" if "P" in video_prompt_type else "inpaint", negate_mask = "N" in video_prompt_type, inpaint_color =0) refresh_preview["video_guide"] = Image.fromarray(src_video[0].cpu().numpy()) if src_mask != None: refresh_preview["video_mask"] = Image.fromarray(src_mask[0].cpu().numpy()) elif "R" in video_prompt_type: # sparse video to video src_image = get_video_frame(video_guide, aligned_guide_start_frame, return_last_if_missing = True, return_PIL = True) - new_height, new_width = calculate_new_dimensions(image_size[0], image_size[1], src_image.height, src_image.width, sample_fit_canvas, block_size = block_size) - src_image = src_image.resize((new_width, new_height), resample=Image.Resampling.LANCZOS) + src_image, _, _ = calculate_dimensions_and_resize_image(src_image, new_height, new_width, sample_fit_canvas, fit_crop, block_size = block_size) refresh_preview["video_guide"] = src_image src_video = convert_image_to_tensor(src_image).unsqueeze(1) if sample_fit_canvas != None: @@ -4814,7 +4892,7 @@ def generate_video( sample_fit_canvas = None else: # video to video - video_guide_processed = preprocess_video(width = image_size[1], height=image_size[0], video_in=video_guide, max_frames= len(keep_frames_parsed), start_frame = aligned_guide_start_frame, fit_canvas= sample_fit_canvas, target_fps = fps) + video_guide_processed = preprocess_video(width = image_size[1], height=image_size[0], video_in=video_guide, max_frames= len(keep_frames_parsed), start_frame = aligned_guide_start_frame, fit_canvas= sample_fit_canvas, fit_crop = fit_crop, target_fps = fps) if video_guide_processed is None: src_video = pre_video_guide else: @@ -4824,42 +4902,54 @@ def generate_video( src_video = video_guide_processed.float().div_(127.5).sub_(1.).permute(-1,0,1,2) if pre_video_guide != None: src_video = torch.cat( [pre_video_guide, src_video], dim=1) + elif image_guide is not None: + image_guide, new_height, new_width = calculate_dimensions_and_resize_image(image_guide, height, width, sample_fit_canvas, fit_crop, block_size = block_size) + image_size = (new_height, new_width) + refresh_preview["image_guide"] = image_guide + sample_fit_canvas = None + if image_mask is not None: + image_mask, _, _ = calculate_dimensions_and_resize_image(image_mask, new_height, new_width, sample_fit_canvas, fit_crop, block_size = block_size) + refresh_preview["image_mask"] = image_mask + + if window_no == 1 and image_refs is not None and len(image_refs) > 0: + if sample_fit_canvas is not None and (nb_frames_positions > 0 or "K" in video_prompt_type) : + from shared.utils.utils import get_outpainting_full_area_dimensions + w, h = image_refs[0].size + if outpainting_dims != None: + h, w = get_outpainting_full_area_dimensions(h,w, outpainting_dims) + image_size = calculate_new_dimensions(height, width, h, w, fit_canvas) + sample_fit_canvas = None + if repeat_no == 1: + if fit_crop: + if any_background_ref == 2: + end_ref_position = len(image_refs) + elif any_background_ref == 1: + 
end_ref_position = nb_frames_positions + 1 + else: + end_ref_position = nb_frames_positions + for i, img in enumerate(image_refs[:end_ref_position]): + image_refs[i] = rescale_and_crop(img, default_image_size[1], default_image_size[0]) + refresh_preview["image_refs"] = image_refs + + if len(image_refs) > nb_frames_positions: + if remove_background_images_ref > 0: + send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")]) + # keep image ratios if there is a background image ref (we will let the model preprocessor decide what to do) but remove bg if requested + image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , image_size[1], image_size[0], + remove_background_images_ref > 0, any_background_ref, + fit_into_canvas= 0 if (any_background_ref > 0 or model_def.get("lock_image_refs_ratios", False)) else 1, + block_size=block_size, + outpainting_dims =outpainting_dims ) + refresh_preview["image_refs"] = image_refs + + if nb_frames_positions > 0: + frames_to_inject = [None] * (max(frames_positions_list) + 1) + for i, pos in enumerate(frames_positions_list): + frames_to_inject[pos] = image_refs[i] if vace : - image_refs_copy = image_refs[nb_frames_positions:].copy() if image_refs != None and len(image_refs) > nb_frames_positions else None # required since prepare_source do inplace modifications - context_scale = [ control_net_weight] - video_guide_processed = video_mask_processed = video_guide_processed2 = video_mask_processed2 = None - if "V" in video_prompt_type: - process_outside_mask = process_map_outside_mask.get(filter_letters(video_prompt_type, "YWX"), None) - preprocess_type, preprocess_type2 = "raw", None - for process_num, process_letter in enumerate( filter_letters(video_prompt_type, "PDSLCMU")): - if process_num == 0: - preprocess_type = process_map_video_guide.get(process_letter, "raw") - else: - preprocess_type2 = process_map_video_guide.get(process_letter, None) - status_info = "Extracting " + processes_names[preprocess_type] - extra_process_list = ([] if preprocess_type2==None else [preprocess_type2]) + ([] if process_outside_mask==None or process_outside_mask == preprocess_type else [process_outside_mask]) - if len(extra_process_list) == 1: - status_info += " and " + processes_names[extra_process_list[0]] - elif len(extra_process_list) == 2: - status_info += ", " + processes_names[extra_process_list[0]] + " and " + processes_names[extra_process_list[1]] - if preprocess_type2 is not None: - context_scale = [ control_net_weight /2, control_net_weight2 /2] - send_cmd("progress", [0, get_latest_status(state, status_info)]) - video_guide_processed, video_mask_processed = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) , start_frame = aligned_guide_start_frame, fit_canvas = sample_fit_canvas, target_fps = fps, process_type = preprocess_type, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =1 ) - if preprocess_type2 != None: - video_guide_processed2, video_mask_processed2 = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed), start_frame = aligned_guide_start_frame, fit_canvas = sample_fit_canvas, target_fps = fps, process_type = preprocess_type2, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in 
video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =2 ) - - if video_guide_processed != None: - if sample_fit_canvas != None: - image_size = video_guide_processed.shape[-3: -1] - sample_fit_canvas = None - refresh_preview["video_guide"] = Image.fromarray(video_guide_processed[0].cpu().numpy()) - if video_guide_processed2 != None: - refresh_preview["video_guide"] = [refresh_preview["video_guide"], Image.fromarray(video_guide_processed2[0].cpu().numpy())] - if video_mask_processed != None: - refresh_preview["video_mask"] = Image.fromarray(video_mask_processed[0].cpu().numpy()) frames_to_inject_parsed = frames_to_inject[aligned_guide_start_frame: aligned_guide_end_frame] + image_refs_copy = image_refs[nb_frames_positions:].copy() if image_refs != None and len(image_refs) > nb_frames_positions else None # required since prepare_source do inplace modifications src_video, src_mask, src_ref_images = wan_model.prepare_source([video_guide_processed] if video_guide_processed2 == None else [video_guide_processed, video_guide_processed2], [video_mask_processed] if video_guide_processed2 == None else [video_mask_processed, video_mask_processed2], @@ -4868,7 +4958,6 @@ def generate_video( keep_video_guide_frames=keep_frames_parsed, start_frame = aligned_guide_start_frame, pre_src_video = [pre_video_guide] if video_guide_processed2 == None else [pre_video_guide, pre_video_guide], - fit_into_canvas = sample_fit_canvas, inject_frames= frames_to_inject_parsed, outpainting_dims = outpainting_dims, any_background_ref = any_background_ref @@ -4931,9 +5020,9 @@ def generate_video( prefix_frames_count = source_video_overlap_frames_count if window_no <= 1 else reuse_frames, frame_num= (current_video_length // latent_size)* latent_size + 1, batch_size = batch_size, - height = height, - width = width, - fit_into_canvas = fit_canvas == 1, + height = image_size[0], + width = image_size[1], + fit_into_canvas = fit_canvas, shift=flow_shift, sample_solver=sample_solver, sampling_steps=num_inference_steps, @@ -4990,6 +5079,8 @@ def generate_video( pre_video_frame = pre_video_frame, original_input_ref_images = original_image_refs[nb_frames_positions:] if original_image_refs is not None else [], image_refs_relative_size = image_refs_relative_size, + image_guide= image_guide, + image_mask= image_mask, ) except Exception as e: if len(control_audio_tracks) > 0 or len(source_audio_tracks) > 0: @@ -5931,7 +6022,7 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None if target == "settings": return inputs - image_outputs = inputs.get("image_mode",0) == 1 + image_outputs = inputs.get("image_mode",0) > 0 pop=[] if "force_fps" in inputs and len(inputs["force_fps"])== 0: @@ -5947,13 +6038,13 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None pop += ["MMAudio_setting", "MMAudio_prompt", "MMAudio_neg_prompt"] video_prompt_type = inputs["video_prompt_type"] - if not base_model_type in ["t2v"]: + if not "G" in video_prompt_type: pop += ["denoising_strength"] if not (server_config.get("enhancer_enabled", 0) > 0 and server_config.get("enhancer_mode", 0) == 0): pop += ["prompt_enhancer"] - if not recammaster and not diffusion_forcing and not flux: + if model_def.get("model_modes", None) is None: pop += ["model_mode"] if not vace and not phantom and not hunyuan_video_custom: @@ -6075,6 +6166,18 @@ def image_to_ref_image_set(state, input_file_list, choice, target, target_name): gr.Info(f"Selected Image was copied to 
{target_name}") return file_list[choice] +def image_to_ref_image_guide(state, input_file_list, choice): + file_list, file_settings_list = get_file_list(state, input_file_list) + if len(file_list) == 0 or choice == None or choice < 0 or choice > len(file_list): return gr.update(), gr.update() + ui_settings = get_current_model_settings(state) + gr.Info(f"Selected Image was copied to Control Image") + new_image = file_list[choice] + if ui_settings["image_mode"]==2: + return new_image, new_image + else: + return new_image, None + + def apply_post_processing(state, input_file_list, choice, PP_temporal_upsampling, PP_spatial_upsampling, PP_film_grain_intensity, PP_film_grain_saturation): gen = get_gen_info(state) @@ -6142,11 +6245,11 @@ def eject_video_from_gallery(state, input_file_list, choice): return gr.Gallery(value = file_list, selected_index= choice), gr.update() if len(file_list) >0 else get_default_video_info(), gr.Row(visible= len(file_list) > 0) def has_video_file_extension(filename): - extension = os.path.splitext(filename)[-1] + extension = os.path.splitext(filename)[-1].lower() return extension in [".mp4"] def has_image_file_extension(filename): - extension = os.path.splitext(filename)[-1] + extension = os.path.splitext(filename)[-1].lower() return extension in [".png", ".jpg", ".jpeg", ".bmp", ".gif", ".webp", ".tif", ".tiff", ".jfif", ".pjpeg"] def add_videos_to_gallery(state, input_file_list, choice, files_to_load): gen = get_gen_info(state) @@ -6308,7 +6411,7 @@ def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, sw return configs, any_image_or_video def record_image_mode_tab(state, evt:gr.SelectData): - state["image_mode_tab"] = 0 if evt.index ==0 else 1 + state["image_mode_tab"] = evt.index def switch_image_mode(state): image_mode = state.get("image_mode_tab", 0) @@ -6316,7 +6419,18 @@ def switch_image_mode(state): ui_defaults = get_model_settings(state, model_type) ui_defaults["image_mode"] = image_mode - + video_prompt_type = ui_defaults.get("video_prompt_type", "") + model_def = get_model_def( model_type) + inpaint_support = model_def.get("inpaint_support", False) + if inpaint_support: + if image_mode == 1: + video_prompt_type = del_in_sequence(video_prompt_type, "VAG") + video_prompt_type = add_to_sequence(video_prompt_type, "KI") + elif image_mode == 2: + video_prompt_type = add_to_sequence(video_prompt_type, "VAG") + video_prompt_type = del_in_sequence(video_prompt_type, "KI") + ui_defaults["video_prompt_type"] = video_prompt_type + return str(time.time()) def load_settings_from_file(state, file_path): @@ -6349,6 +6463,7 @@ def load_settings_from_file(state, file_path): def save_inputs( target, + image_mask_guide, lset_name, image_mode, prompt, @@ -6434,13 +6549,18 @@ def save_inputs( state, ): - - # if state.get("validate_success",0) != 1: - # return + model_filename = state["model_filename"] model_type = state["model_type"] + if image_mask_guide is not None and image_mode == 2: + if "background" in image_mask_guide: + image_guide = image_mask_guide["background"] + if "layers" in image_mask_guide and len(image_mask_guide["layers"])>0: + image_mask = image_mask_guide["layers"][0] + image_mask_guide = None inputs = get_function_arguments(save_inputs, locals()) inputs.pop("target") + inputs.pop("image_mask_guide") cleaned_inputs = prepare_inputs_dict(target, inputs) if target == "settings": defaults_filename = get_settings_file_name(model_type) @@ -6544,11 +6664,16 @@ def change_model(state, model_choice): return header -def fill_inputs(state): 
+def get_current_model_settings(state): model_type = state["model_type"] - ui_defaults = get_model_settings(state, model_type) + ui_defaults = get_model_settings(state, model_type) if ui_defaults == None: ui_defaults = get_default_settings(model_type) + set_model_settings(state, model_type, ui_defaults) + return ui_defaults + +def fill_inputs(state): + ui_defaults = get_current_model_settings(state) return generate_video_tab(update_form = True, state_dict = state, ui_defaults = ui_defaults) @@ -6623,7 +6748,9 @@ def refresh_image_prompt_type_radio(state, image_prompt_type, image_prompt_type_ image_prompt_type = del_in_sequence(image_prompt_type, "VLTS") image_prompt_type = add_to_sequence(image_prompt_type, image_prompt_type_radio) any_video_source = len(filter_letters(image_prompt_type, "VL"))>0 - end_visible = any_letters(image_prompt_type, "SVL") + model_def = get_model_def(state["model_type"]) + image_prompt_types_allowed = model_def.get("image_prompt_types_allowed", "") + end_visible = "E" in image_prompt_types_allowed and any_letters(image_prompt_type, "SVL") return image_prompt_type, gr.update(visible = "S" in image_prompt_type ), gr.update(visible = end_visible and ("E" in image_prompt_type) ), gr.update(visible = "V" in image_prompt_type) , gr.update(visible = any_video_source), gr.update(visible = end_visible) def refresh_image_prompt_type_endcheckbox(state, image_prompt_type, image_prompt_type_radio, end_checkbox): @@ -6654,7 +6781,7 @@ def refresh_video_prompt_type_video_mask(state, video_prompt_type, video_prompt_ visible= "A" in video_prompt_type model_type = state["model_type"] model_def = get_model_def(model_type) - image_outputs = image_mode == 1 + image_outputs = image_mode > 0 return video_prompt_type, gr.update(visible= visible and not image_outputs), gr.update(visible= visible and image_outputs), gr.update(visible= visible ) def refresh_video_prompt_type_alignment(state, video_prompt_type, video_prompt_type_video_guide): @@ -6663,20 +6790,22 @@ def refresh_video_prompt_type_alignment(state, video_prompt_type, video_prompt_t return video_prompt_type def refresh_video_prompt_type_video_guide(state, video_prompt_type, video_prompt_type_video_guide, image_mode): - video_prompt_type = del_in_sequence(video_prompt_type, "PDESLCMGUV") + video_prompt_type = del_in_sequence(video_prompt_type, "PDESLCMUV") video_prompt_type = add_to_sequence(video_prompt_type, video_prompt_type_video_guide) visible = "V" in video_prompt_type model_type = state["model_type"] base_model_type = get_base_model_type(model_type) mask_visible = visible and "A" in video_prompt_type and not "U" in video_prompt_type model_def = get_model_def(model_type) - image_outputs = image_mode == 1 + image_outputs = image_mode > 0 vace= test_vace_module(model_type) keep_frames_video_guide_visible = not image_outputs and visible and not model_def.get("keep_frames_video_guide_not_supported", False) return video_prompt_type, gr.update(visible = visible and not image_outputs), gr.update(visible = visible and image_outputs), gr.update(visible = keep_frames_video_guide_visible), gr.update(visible = visible and "G" in video_prompt_type), gr.update(visible= (visible or "F" in video_prompt_type or "K" in video_prompt_type) and vace), gr.update(visible= visible and not "U" in video_prompt_type ), gr.update(visible= mask_visible and not image_outputs), gr.update(visible= mask_visible and image_outputs), gr.update(visible= mask_visible) def refresh_video_prompt_type_video_guide_alt(state, video_prompt_type, 
video_prompt_type_video_guide_alt): - video_prompt_type = del_in_sequence(video_prompt_type, "RGUVQKI") + model_def = get_model_def(state["model_type"]) + guide_custom_choices = model_def.get("guide_custom_choices",{}) + video_prompt_type = del_in_sequence(video_prompt_type, guide_custom_choices.get("letters_filter","")) video_prompt_type = add_to_sequence(video_prompt_type, video_prompt_type_video_guide_alt) control_video_visible = "V" in video_prompt_type ref_images_visible = "I" in video_prompt_type @@ -6711,7 +6840,7 @@ def get_image_end_label(multi_prompts_gen_type): return "Images as ending points for new Videos in the Generation Queue" if multi_prompts_gen_type == 0 else "Images as ending points for each new Window of the same Video Generation" def refresh_prompt_labels(multi_prompts_gen_type, image_mode): - prompt_label, wizard_prompt_label = get_prompt_labels(multi_prompts_gen_type, image_mode == 1) + prompt_label, wizard_prompt_label = get_prompt_labels(multi_prompts_gen_type, image_mode > 0) return gr.update(label=prompt_label), gr.update(label = wizard_prompt_label), gr.update(label=get_image_end_label(multi_prompts_gen_type)) def show_preview_column_modal(state, column_no): @@ -7032,7 +7161,6 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non diffusion_forcing = "diffusion_forcing" in model_filename ltxv = "ltxv" in model_filename lock_inference_steps = model_def.get("lock_inference_steps", False) - model_reference_image = model_def.get("reference_image", False) any_tea_cache = model_def.get("tea_cache", False) any_mag_cache = model_def.get("mag_cache", False) recammaster = base_model_type in ["recam_1.3B"] @@ -7075,18 +7203,22 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non if not v2i_switch_supported and not image_outputs: image_mode_value = 0 else: - image_outputs = image_mode_value == 1 + image_outputs = image_mode_value > 0 + inpaint_support = model_def.get("inpaint_support", False) image_mode = gr.Number(value =image_mode_value, visible = False) - - with gr.Tabs(visible = v2i_switch_supported, selected= "t2i" if image_mode_value == 1 else "t2v" ) as image_mode_tabs: - with gr.Tab("Text to Video", id = "t2v", elem_classes="compact_tab"): + image_mode_tab_selected= "t2i" if image_mode_value == 1 else ("inpaint" if image_mode_value == 2 else "t2v") + with gr.Tabs(visible = v2i_switch_supported or inpaint_support, selected= image_mode_tab_selected ) as image_mode_tabs: + with gr.Tab("Text to Video", id = "t2v", elem_classes="compact_tab", visible = v2i_switch_supported) as tab_t2v: pass with gr.Tab("Text to Image", id = "t2i", elem_classes="compact_tab"): pass + with gr.Tab("Image Inpainting", id = "inpaint", elem_classes="compact_tab", visible=inpaint_support) as tab_inpaint: + pass image_prompt_types_allowed = model_def.get("image_prompt_types_allowed", "") model_mode_choices = model_def.get("model_modes", None) with gr.Column(visible= len(image_prompt_types_allowed)> 0 or model_mode_choices is not None) as image_prompt_column: + # Video Continue / Start Frame / End Frame image_prompt_type_value= ui_defaults.get("image_prompt_type","") image_prompt_type = gr.Text(value= image_prompt_type_value, visible= False) image_prompt_type_choices = [] @@ -7123,192 +7255,167 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non model_mode = gr.Dropdown(choices=model_mode_choices["choices"], value=ui_defaults.get("model_mode", model_mode_choices["default"]), 
label=model_mode_choices["label"], visible=True) keep_frames_video_source = gr.Text(value=ui_defaults.get("keep_frames_video_source","") , visible= len(filter_letters(image_prompt_type_value, "VL"))>0 , scale = 2, label= "Truncate Video beyond this number of resampled Frames (empty=Keep All, negative truncates from End)" ) - with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or hunyuan_video_custom_edit or t2v or standin or ltxv or infinitetalk or recammaster or flux and model_reference_image or qwen and model_reference_image) as video_prompt_column: + any_control_video = any_control_image = False + guide_preprocessing = model_def.get("guide_preprocessing", None) + mask_preprocessing = model_def.get("mask_preprocessing", None) + guide_custom_choices = model_def.get("guide_custom_choices", None) + image_ref_choices = model_def.get("image_ref_choices", None) + + # with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or hunyuan_video_custom_edit or t2v or standin or ltxv or infinitetalk or recammaster or (flux or qwen ) and model_reference_image and image_mode_value >=1) as video_prompt_column: + with gr.Column(visible= guide_preprocessing is not None or mask_preprocessing is not None or guide_custom_choices is not None or image_ref_choices is not None) as video_prompt_column: video_prompt_type_value= ui_defaults.get("video_prompt_type","") video_prompt_type = gr.Text(value= video_prompt_type_value, visible= False) - any_control_video = True - any_control_image = image_outputs - with gr.Row(): - if t2v: - video_prompt_type_video_guide = gr.Dropdown( - choices=[ - ("Use Text Prompt Only", ""), - ("Image to Image guided by Text Prompt" if image_outputs else "Video to Video guided by Text Prompt", "GUV"), - ], - value=filter_letters(video_prompt_type_value, "GUV"), - label="Video to Video", scale = 2, show_label= False, visible= True - ) - elif vace : + with gr.Row(visible = image_mode_value!=2) as guide_selection_row: + # Control Video Preprocessing + if guide_preprocessing is None: + video_prompt_type_video_guide = gr.Dropdown(choices=[("","")], value="", label="Control Video", scale = 2, visible= False, show_label= True, ) + else: pose_label = "Pose" if image_outputs else "Motion" - video_prompt_type_video_guide = gr.Dropdown( - choices=[ - ("No Control Image" if image_outputs else "No Control Video", ""), - ("Keep Control Image Unchanged" if image_outputs else "Keep Control Video Unchanged", "UV"), - (f"Transfer Human {pose_label}" , "PV"), - ("Transfer Depth", "DV"), - ("Transfer Shapes", "SV"), - ("Transfer Flow", "LV"), - ("Recolorize", "CV"), - ("Perform Inpainting", "MV"), - ("Use Vace raw format", "V"), - (f"Transfer Human {pose_label} & Depth", "PDV"), - (f"Transfer Human {pose_label} & Shapes", "PSV"), - (f"Transfer Human {pose_label} & Flow", "PLV"), - ("Transfer Depth & Shapes", "DSV"), - ("Transfer Depth & Flow", "DLV"), - ("Transfer Shapes & Flow", "SLV"), - ], - value=filter_letters(video_prompt_type_value, "PDSLCMGUV"), - label="Control Image Process" if image_outputs else "Control Video Process", scale = 2, visible= True, show_label= True, - ) - elif ltxv: - video_prompt_type_video_guide = gr.Dropdown( - choices=[ - ("No Control Video", ""), - ("Transfer Human Motion", "PV"), - ("Transfer Depth", "DV"), - ("Transfer Canny Edges", "EV"), - ("Use LTXV raw format", "V"), - ], - value=filter_letters(video_prompt_type_value, "PDEV"), - label="Control Video Process", scale = 2, visible= True, show_label= True, - 
) + guide_preprocessing_labels_all = { + "": "No Control Video", + "UV": "Keep Control Video Unchanged", + "PV": f"Transfer Human {pose_label}", + "PMV": f"Transfer Human {pose_label}", + "DV": "Transfer Depth", + "EV": "Transfer Canny Edges", + "SV": "Transfer Shapes", + "LV": "Transfer Flow", + "CV": "Recolorize", + "MV": "Perform Inpainting", + "V": "Use Vace raw format", + "PDV": f"Transfer Human {pose_label} & Depth", + "PSV": f"Transfer Human {pose_label} & Shapes", + "PLV": f"Transfer Human {pose_label} & Flow" , + "DSV": "Transfer Depth & Shapes", + "DLV": "Transfer Depth & Flow", + "SLV": "Transfer Shapes & Flow", + } + guide_preprocessing_choices = [] + guide_preprocessing_labels = guide_preprocessing.get("labels", {}) + for process_type in guide_preprocessing["selection"]: + process_label = guide_preprocessing_labels.get(process_type, None) + process_label = guide_preprocessing_labels_all.get(process_type,process_type) if process_label is None else process_label + if image_outputs: process_label = process_label.replace("Video", "Image") + guide_preprocessing_choices.append( (process_label, process_type) ) - elif hunyuan_video_custom_edit: + video_prompt_type_video_guide_label = guide_preprocessing.get("label", "Control Video Process") + if image_outputs: video_prompt_type_video_guide_label = video_prompt_type_video_guide_label.replace("Video", "Image") video_prompt_type_video_guide = gr.Dropdown( - choices=[ - ("Inpaint Control Image" if image_outputs else "Inpaint Control Video", "MV"), - ("Transfer Human Motion", "PMV"), - ], - value=filter_letters(video_prompt_type_value, "PDSLCMUV"), - label="Image to Image" if image_outputs else "Video to Video", scale = 3, visible= True, show_label= True, - ) - elif recammaster: - video_prompt_type_video_guide = gr.Dropdown(value="UV", choices = [("Control Video","UV")], visible=False) + guide_preprocessing_choices, + value=filter_letters(video_prompt_type_value, "PDESLCMUV", guide_preprocessing.get("default", "") ), + label= video_prompt_type_video_guide_label , scale = 2, visible= guide_preprocessing.get("visible", True) , show_label= True, + ) + any_control_video = True + any_control_image = image_outputs + + # Alternate Control Video Preprocessing / Options + if guide_custom_choices is None: + video_prompt_type_video_guide_alt = gr.Dropdown(choices=[("","")], value="", label="Control Video", visible= False, scale = 2 ) else: - any_control_video = False - any_control_image = False - video_prompt_type_video_guide = gr.Dropdown(value="", choices = [("","")], visible=False) - - if infinitetalk: + video_prompt_type_video_guide_alt_label = guide_custom_choices.get("label", "Control Video Process") + if image_outputs: video_prompt_type_video_guide_alt_label = video_prompt_type_video_guide_alt_label.replace("Video", "Image") + video_prompt_type_video_guide_alt_choices = [(label.replace("Video", "Image") if image_outputs else label, value) for label,value in guide_custom_choices["choices"] ] video_prompt_type_video_guide_alt = gr.Dropdown( - choices=[ - ("Images to Video, each Reference Image will start a new shot with a new Sliding Window - Sharp Transitions", "QKI"), - ("Images to Video, each Reference Image will start a new shot with a new Sliding Window - Smooth Transitions", "KI"), - ("Sparse Video to Video, one Image will by extracted from Video for each new Sliding Window - Sharp Transitions", "QRUV"), - ("Sparse Video to Video, one Image will by extracted from Video for each new Sliding Window - Smooth Transitions", "RUV"), - ("Video to 
Video, amount of motion transferred depends on Denoising Strength - Sharp Transitions", "GQUV"), - ("Video to Video, amount of motion transferred depends on Denoising Strength - Smooth Transitions", "GUV"), - ], - value=filter_letters(video_prompt_type_value, "RGUVQKI"), - label="Video to Video", scale = 3, visible= True, show_label= False, - ) - any_control_video = any_control_image = True - else: - video_prompt_type_video_guide_alt = gr.Dropdown(value="", choices = [("","")], visible=False) + choices= video_prompt_type_video_guide_alt_choices, + value=filter_letters(video_prompt_type_value, guide_custom_choices["letters_filter"], guide_custom_choices.get("default", "") ), + visible = guide_custom_choices.get("visible", True), + label= video_prompt_type_video_guide_alt_label, show_label= guide_custom_choices.get("show_label", True), scale = 2 + ) + any_control_video = True + any_control_image = image_outputs - # video_prompt_video_guide_trigger = gr.Text(visible=False, value="") - if t2v: - video_prompt_type_video_mask = gr.Dropdown(value = "", choices = [""], visible = False) - elif hunyuan_video_custom_edit: - video_prompt_type_video_mask = gr.Dropdown( - choices=[ - ("Masked Area", "A"), - ("Non Masked Area", "NA"), - ], - value= filter_letters(video_prompt_type_value, "NA"), - visible= "V" in video_prompt_type_value, - label="Area Processed", scale = 2, show_label= True, - ) - elif ltxv: - video_prompt_type_video_mask = gr.Dropdown( - choices=[ - ("Whole Frame", ""), - ("Masked Area", "A"), - ("Non Masked Area", "NA"), - ("Masked Area, rest Inpainted", "XA"), - ("Non Masked Area, rest Inpainted", "XNA"), - ], - value= filter_letters(video_prompt_type_value, "XNA"), - visible= "V" in video_prompt_type_value and not "U" in video_prompt_type_value, - label="Area Processed", scale = 2, show_label= True, - ) + # Control Mask Preprocessing + if mask_preprocessing is None: + video_prompt_type_video_mask = gr.Dropdown(choices=[("","")], value="", label="Video Mask", scale = 2, visible= False, show_label= True, ) else: + mask_preprocessing_labels_all = { + "": "Whole Frame", + "A": "Masked Area", + "NA": "Non Masked Area", + "XA": "Masked Area, rest Inpainted", + "XNA": "Non Masked Area, rest Inpainted", + "YA": "Masked Area, rest Depth", + "YNA": "Non Masked Area, rest Depth", + "WA": "Masked Area, rest Shapes", + "WNA": "Non Masked Area, rest Shapes", + "ZA": "Masked Area, rest Flow", + "ZNA": "Non Masked Area, rest Flow" + } + + mask_preprocessing_choices = [] + mask_preprocessing_labels = guide_preprocessing.get("labels", {}) + for process_type in mask_preprocessing["selection"]: + process_label = mask_preprocessing_labels.get(process_type, None) + process_label = mask_preprocessing_labels_all.get(process_type, process_type) if process_label is None else process_label + mask_preprocessing_choices.append( (process_label, process_type) ) + + video_prompt_type_video_mask_label = guide_preprocessing.get("label", "Area Processed") video_prompt_type_video_mask = gr.Dropdown( - choices=[ - ("Whole Frame", ""), - ("Masked Area", "A"), - ("Non Masked Area", "NA"), - ("Masked Area, rest Inpainted", "XA"), - ("Non Masked Area, rest Inpainted", "XNA"), - ("Masked Area, rest Depth", "YA"), - ("Non Masked Area, rest Depth", "YNA"), - ("Masked Area, rest Shapes", "WA"), - ("Non Masked Area, rest Shapes", "WNA"), - ("Masked Area, rest Flow", "ZA"), - ("Non Masked Area, rest Flow", "ZNA"), - ], - value= filter_letters(video_prompt_type_value, "XYZWNA"), - visible= "V" in video_prompt_type_value and not "U" 
in video_prompt_type_value and not hunyuan_video_custom and not ltxv, - label="Area Processed", scale = 2, show_label= True, - ) - image_ref_choices = model_def.get("image_ref_choices", None) - if image_ref_choices is not None: - video_prompt_type_image_refs = gr.Dropdown( - choices= image_ref_choices["choices"], - value=filter_letters(video_prompt_type_value, image_ref_choices["letters_filter"]), - visible = True, - label=image_ref_choices["label"], show_label= True, scale = 2 - ) - elif t2v: - video_prompt_type_image_refs = gr.Dropdown(value="", label="Ref Image", choices=[""], visible =False) - elif vace: - video_prompt_type_image_refs = gr.Dropdown( - choices=[ - ("None", ""), - ("Inject only People / Objects", "I"), - ("Inject Landscape and then People / Objects", "KI"), - ("Inject Frames and then People / Objects", "FI"), - ], - value=filter_letters(video_prompt_type_value, "KFI"), - visible = True, - label="Reference Images", show_label= True, scale = 2 - ) - elif standin: # and not vace - video_prompt_type_image_refs = gr.Dropdown( - choices=[ - ("No Reference Image", ""), - ("Reference Image is a Person Face", "I"), - ], - value=filter_letters(video_prompt_type_value, "I"), - visible = True, - show_label=False, - label="Reference Image", scale = 2 + mask_preprocessing_choices, + value=filter_letters(video_prompt_type_value, "XYZWNA", mask_preprocessing.get("default", "")), + label= video_prompt_type_video_mask_label , scale = 2, visible= "V" in video_prompt_type_value and not "U" in video_prompt_type_value and mask_preprocessing.get("visible", True), + show_label= True, ) - elif (flux or qwen) and model_reference_image: + + # Image Refs Selection + if image_ref_choices is None: video_prompt_type_image_refs = gr.Dropdown( - choices=[ - ("None", ""), - ("Conditional Images is first Main Subject / Landscape and may be followed by People / Objects", "KI"), - ("Conditional Images are People / Objects", "I"), - ], - value=filter_letters(video_prompt_type_value, "KI"), - visible = True, - show_label=False, - label="Reference Images Combination Method", scale = 2 - ) - else: - video_prompt_type_image_refs = gr.Dropdown( - choices=[ ("None", ""),("Start", "KI"),("Ref Image", "I")], - value=filter_letters(video_prompt_type_value, "KI"), + # choices=[ ("None", ""),("Start", "KI"),("Ref Image", "I")], + choices=[ ("None", ""),], + value=filter_letters(video_prompt_type_value, ""), visible = False, label="Start / Reference Images", scale = 2 ) - image_guide = gr.Image(label= "Control Image", height = gallery_height, type ="pil", visible= image_outputs and "V" in video_prompt_type_value, value= ui_defaults.get("image_guide", None)) - video_guide = gr.Video(label= "Control Video", height = gallery_height, visible= (not image_outputs) and "V" in video_prompt_type_value, value= ui_defaults.get("video_guide", None)) + any_reference_image = False + else: + any_reference_image = True + video_prompt_type_image_refs = gr.Dropdown( + choices= image_ref_choices["choices"], + value=filter_letters(video_prompt_type_value, image_ref_choices["letters_filter"]), + visible = image_ref_choices.get("visible", True), + label=image_ref_choices.get("label", "Ref. 
Images Type"), show_label= True, scale = 2 + ) - denoising_strength = gr.Slider(0, 1, value= ui_defaults.get("denoising_strength" ,0.5), step=0.01, label="Denoising Strength (the Lower the Closer to the Control Video)", visible = "G" in video_prompt_type_value, show_reset_button= False) + image_guide = gr.Image(label= "Control Image", height = gallery_height, type ="pil", visible= image_mode_value==1 and "V" in video_prompt_type_value, value= ui_defaults.get("image_guide", None)) + video_guide = gr.Video(label= "Control Video", height = gallery_height, visible= (not image_outputs) and "V" in video_prompt_type_value, value= ui_defaults.get("video_guide", None)) + if image_mode_value == 2 and inpaint_support: + image_guide_value = ui_defaults.get("image_guide", None) + image_mask_value = ui_defaults.get("image_mask", None) + if image_guide_value is None: + image_mask_guide_value = None + else: + def rgb_bw_to_rgba_mask(img, thresh=127): + a = img.convert('L').point(lambda p: 255 if p > thresh else 0) # alpha + out = Image.new('RGBA', img.size, (255, 255, 255, 0)) # white, transparent + out.putalpha(a) # white where alpha=255 + return out + + image_mask_value = rgb_bw_to_rgba_mask(image_mask_value) + image_mask_guide_value = { "background" : image_guide_value, "composite" : None, "layers": [image_mask_value] } + + image_mask_guide = gr.ImageEditor( + label="Control Image to be Inpainted", + value = image_mask_guide_value, + type='pil', + sources=["upload", "webcam"], + image_mode='RGB', + layers=False, + brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"), + # fixed_canvas= True, + width=800, + height=800, + # transforms=None, + # interactive=True, + elem_id="img_editor", + visible= True + ) + any_control_image = True + else: + image_mask_guide = gr.ImageEditor(value = None, visible = False, elem_id="img_editor") + + + denoising_strength = gr.Slider(0, 1, value= ui_defaults.get("denoising_strength" ,0.5), step=0.01, label=f"Denoising Strength (the Lower the Closer to the Control {'Image' if image_outputs else 'Video'})", visible = "G" in video_prompt_type_value, show_reset_button= False) keep_frames_video_guide_visible = not image_outputs and "V" in video_prompt_type_value and not model_def.get("keep_frames_video_guide_not_supported", False) keep_frames_video_guide = gr.Text(value=ui_defaults.get("keep_frames_video_guide","") , visible= keep_frames_video_guide_visible , scale = 2, label= "Frames to keep in Control Video (empty=All, 1=first, a:b for a range, space to separate values)" ) #, -1=last @@ -7325,11 +7432,10 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non video_guide_outpainting_left = gr.Slider(0, 100, value= video_guide_outpainting_list[2], step=5, label="Left %", show_reset_button= False) video_guide_outpainting_right = gr.Slider(0, 100, value= video_guide_outpainting_list[3], step=5, label="Right %", show_reset_button= False) any_image_mask = image_outputs and vace - image_mask = gr.Image(label= "Image Mask Area (for Inpainting, white = Control Area, black = Unchanged)", type ="pil", visible= image_outputs and "V" in video_prompt_type_value and "A" in video_prompt_type_value and not "U" in video_prompt_type_value , height = gallery_height, value= ui_defaults.get("image_mask", None)) + image_mask = gr.Image(label= "Image Mask Area (for Inpainting, white = Control Area, black = Unchanged)", type ="pil", visible= image_mode_value==1 and "V" in video_prompt_type_value and "A" in video_prompt_type_value and not "U" in video_prompt_type_value , 
height = gallery_height, value= ui_defaults.get("image_mask", None)) video_mask = gr.Video(label= "Video Mask Area (for Inpainting, white = Control Area, black = Unchanged)", visible= (not image_outputs) and "V" in video_prompt_type_value and "A" in video_prompt_type_value and not "U" in video_prompt_type_value , height = gallery_height, value= ui_defaults.get("video_mask", None)) mask_expand = gr.Slider(-10, 50, value=ui_defaults.get("mask_expand", 0), step=1, label="Expand / Shrink Mask Area", visible= "V" in video_prompt_type_value and "A" in video_prompt_type_value and not "U" in video_prompt_type_value ) - any_reference_image = vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or infinitetalk or (flux or qwen) and model_reference_image image_refs_single_image_mode = model_def.get("one_image_ref_needed", False) image_refs_label = "Start Image" if hunyuan_video_avatar else ("Reference Image" if image_refs_single_image_mode else "Reference Images") + (" (each Image will start a new Clip)" if infinitetalk else "") @@ -7424,10 +7530,13 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non visible= True, show_label= not on_demand_prompt_enhancer, ) with gr.Row(): - if server_config.get("fit_canvas", 0) == 1: - label = "Max Resolution (As it maybe less depending on video width / height ratio)" + fit_canvas = server_config.get("fit_canvas", 0) + if fit_canvas == 1: + label = "Outer Box Resolution (one dimension may be less to preserve video W/H ratio)" + elif fit_canvas == 2: + label = "Output Resolution (Input Images wil be Cropped if the W/H ratio is different)" else: - label = "Max Resolution (Pixels will be reallocated depending on the output width / height ratio)" + label = "Resolution Budget (Pixels will be reallocated to preserve Inputs W/H ratio)" current_resolution_choice = ui_defaults.get("resolution","832x480") if update_form or last_resolution is None else last_resolution model_resolutions = model_def.get("resolutions", None) resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions) @@ -7751,7 +7860,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non sliding_window_defaults = model_def.get("sliding_window_defaults", {}) sliding_window_size = gr.Slider(5, get_max_frames(257), value=ui_defaults.get("sliding_window_size", 129), step=4, label="Sliding Window Size") sliding_window_overlap = gr.Slider(sliding_window_defaults.get("overlap_min", 1), sliding_window_defaults.get("overlap_max", 97), value=ui_defaults.get("sliding_window_overlap",sliding_window_defaults.get("overlap_default", 5)), step=sliding_window_defaults.get("overlap_step", 4), label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)") - sliding_window_color_correction_strength = gr.Slider(0, 1, value=ui_defaults.get("sliding_window_color_correction_strength",1), step=0.01, label="Color Correction Strength (match colors of new window with previous one, 0 = disabled)") + sliding_window_color_correction_strength = gr.Slider(0, 1, value=ui_defaults.get("sliding_window_color_correction_strength",1), step=0.01, label="Color Correction Strength (match colors of new window with previous one, 0 = disabled)", visible = True) sliding_window_overlap_noise = gr.Slider(0, 150, value=ui_defaults.get("sliding_window_overlap_noise",20 if vace else 0), step=1, label="Noise to be added to overlapped frames to reduce blur effect" , visible = 
vace) sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, label="Discard Last Frames of a Window (that may have bad quality)", visible = True) @@ -7902,7 +8011,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non add_to_queue_trigger = gr.Text(visible = False) with gr.Column(visible= False) as current_gen_column: - with gr.Accordion("Preview", open=False) as queue_accordion: + with gr.Accordion("Preview", open=False): preview = gr.Image(label="Preview", height=200, show_label= False) preview_trigger = gr.Text(visible= False) gen_info = gr.HTML(visible=False, min_height=1) @@ -7947,8 +8056,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non video_guide_outpainting_checkbox, video_guide_outpainting_row, show_advanced, video_info_to_control_video_btn, video_info_to_video_source_btn, sample_solver_row, video_buttons_row, image_buttons_row, video_postprocessing_tab, audio_remuxing_tab, PP_MMAudio_row, PP_custom_audio_row, video_info_to_start_image_btn, video_info_to_end_image_btn, video_info_to_reference_image_btn, video_info_to_image_guide_btn, video_info_to_image_mask_btn, - NAG_col, speakers_locations_row, embedded_guidance_row, guidance_phases_row, guidance_row, resolution_group, cfg_free_guidance_col, control_net_weights_row, image_mode_tabs, - min_frames_if_references_col, video_prompt_type_alignment, prompt_enhancer_btn] + image_start_extra + image_end_extra + image_refs_extra # presets_column, + NAG_col, speakers_locations_row, embedded_guidance_row, guidance_phases_row, guidance_row, resolution_group, cfg_free_guidance_col, control_net_weights_row, guide_selection_row, image_mode_tabs, + min_frames_if_references_col, video_prompt_type_alignment, prompt_enhancer_btn, tab_inpaint, tab_t2v] + image_start_extra + image_end_extra + image_refs_extra # presets_column, if update_form: locals_dict = locals() gen_inputs = [state_dict if k=="state" else locals_dict[k] for k in inputs_names] + [state_dict] + extra_inputs @@ -7970,10 +8079,10 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non image_prompt_type_radio.change(fn=refresh_image_prompt_type_radio, inputs=[state, image_prompt_type, image_prompt_type_radio], outputs=[image_prompt_type, image_start_row, image_end_row, video_source, keep_frames_video_source, image_prompt_type_endcheckbox], show_progress="hidden" ) image_prompt_type_endcheckbox.change(fn=refresh_image_prompt_type_endcheckbox, inputs=[state, image_prompt_type, image_prompt_type_radio, image_prompt_type_endcheckbox], outputs=[image_prompt_type, image_end_row] ) # video_prompt_video_guide_trigger.change(fn=refresh_video_prompt_video_guide_trigger, inputs=[state, video_prompt_type, video_prompt_video_guide_trigger], outputs=[video_prompt_type, video_prompt_type_video_guide, video_guide, keep_frames_video_guide, denoising_strength, video_guide_outpainting_col, video_prompt_type_video_mask, video_mask, mask_expand]) - video_prompt_type_image_refs.input(fn=refresh_video_prompt_type_image_refs, inputs = [state, video_prompt_type, video_prompt_type_image_refs], outputs = [video_prompt_type, image_refs_row, remove_background_images_ref, image_refs_relative_size, frames_positions,video_guide_outpainting_col]) - video_prompt_type_video_guide.input(fn=refresh_video_prompt_type_video_guide, inputs = [state, video_prompt_type, video_prompt_type_video_guide, image_mode], outputs = [video_prompt_type, video_guide, 
image_guide, keep_frames_video_guide, denoising_strength, video_guide_outpainting_col, video_prompt_type_video_mask, video_mask, image_mask, mask_expand]) - video_prompt_type_video_guide_alt.input(fn=refresh_video_prompt_type_video_guide_alt, inputs = [state, video_prompt_type, video_prompt_type_video_guide_alt], outputs = [video_prompt_type, video_guide, image_refs_row, denoising_strength ]) - video_prompt_type_video_mask.input(fn=refresh_video_prompt_type_video_mask, inputs = [state, video_prompt_type, video_prompt_type_video_mask, image_mode], outputs = [video_prompt_type, video_mask, image_mask, mask_expand]) + video_prompt_type_image_refs.input(fn=refresh_video_prompt_type_image_refs, inputs = [state, video_prompt_type, video_prompt_type_image_refs], outputs = [video_prompt_type, image_refs_row, remove_background_images_ref, image_refs_relative_size, frames_positions,video_guide_outpainting_col], show_progress="hidden") + video_prompt_type_video_guide.input(fn=refresh_video_prompt_type_video_guide, inputs = [state, video_prompt_type, video_prompt_type_video_guide, image_mode], outputs = [video_prompt_type, video_guide, image_guide, keep_frames_video_guide, denoising_strength, video_guide_outpainting_col, video_prompt_type_video_mask, video_mask, image_mask, mask_expand], show_progress="hidden") + video_prompt_type_video_guide_alt.input(fn=refresh_video_prompt_type_video_guide_alt, inputs = [state, video_prompt_type, video_prompt_type_video_guide_alt], outputs = [video_prompt_type, video_guide, image_refs_row, denoising_strength ], show_progress="hidden") + video_prompt_type_video_mask.input(fn=refresh_video_prompt_type_video_mask, inputs = [state, video_prompt_type, video_prompt_type_video_mask, image_mode], outputs = [video_prompt_type, video_mask, image_mask, mask_expand], show_progress="hidden") video_prompt_type_alignment.input(fn=refresh_video_prompt_type_alignment, inputs = [state, video_prompt_type, video_prompt_type_alignment], outputs = [video_prompt_type]) multi_prompts_gen_type.select(fn=refresh_prompt_labels, inputs=[multi_prompts_gen_type, image_mode], outputs=[prompt, wizard_prompt, image_end], show_progress="hidden") video_guide_outpainting_top.input(fn=update_video_guide_outpainting, inputs=[video_guide_outpainting, video_guide_outpainting_top, gr.State(0)], outputs = [video_guide_outpainting], trigger_mode="multiple" ) @@ -7984,8 +8093,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non show_advanced.change(fn=switch_advanced, inputs=[state, show_advanced, lset_name], outputs=[advanced_row, preset_buttons_rows, refresh_lora_btn, refresh2_row ,lset_name]).then( fn=switch_prompt_type, inputs = [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars], outputs = [wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, prompt_column_advanced, prompt_column_wizard, prompt_column_wizard_vars, *prompt_vars]) queue_df.select( fn=handle_celll_selection, inputs=state, outputs=[queue_df, modal_image_display, modal_container]) - gr.on( triggers=[output.change, output.select], fn=select_video, inputs=[state, output], outputs=[last_choice, video_info, video_buttons_row, image_buttons_row, video_postprocessing_tab, audio_remuxing_tab]) - preview_trigger.change(refresh_preview, inputs= [state], outputs= [preview]) + gr.on( triggers=[output.change, output.select], fn=select_video, inputs=[state, output], outputs=[last_choice, video_info, video_buttons_row, image_buttons_row, 
video_postprocessing_tab, audio_remuxing_tab], show_progress="hidden") + preview_trigger.change(refresh_preview, inputs= [state], outputs= [preview], show_progress="hidden") PP_MMAudio_setting.change(fn = lambda value : [gr.update(visible = value == 1), gr.update(visible = value == 0)] , inputs = [PP_MMAudio_setting], outputs = [PP_MMAudio_row, PP_custom_audio_row] ) def refresh_status_async(state, progress=gr.Progress()): gen = get_gen_info(state) @@ -8017,7 +8126,9 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non output_trigger.change(refresh_gallery, inputs = [state], - outputs = [output, gen_info, generate_btn, add_to_queue_btn, current_gen_column, current_gen_buttons_row, queue_df, abort_btn, onemorewindow_btn]) + outputs = [output, gen_info, generate_btn, add_to_queue_btn, current_gen_column, current_gen_buttons_row, queue_df, abort_btn, onemorewindow_btn], + show_progress="hidden" + ) preview_column_no.input(show_preview_column_modal, inputs=[state, preview_column_no], outputs=[preview_column_no, modal_image_display, modal_container]) @@ -8033,7 +8144,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non gr.on( triggers=[video_info_extract_settings_btn.click, video_info_extract_image_settings_btn.click], fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8042,7 +8154,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non prompt_enhancer_btn.click(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8050,7 +8163,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non saveform_trigger.change(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8065,14 +8179,14 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non video_info_to_video_source_btn.click(fn=video_to_source_video, inputs =[state, output, last_choice], outputs = [video_source] ) video_info_to_start_image_btn.click(fn=image_to_ref_image_add, inputs =[state, output, last_choice, image_start, gr.State("Start Image")], outputs = [image_start] ) video_info_to_end_image_btn.click(fn=image_to_ref_image_add, inputs =[state, output, last_choice, image_end, gr.State("End Image")], outputs = [image_end] ) - video_info_to_image_guide_btn.click(fn=image_to_ref_image_set, inputs =[state, output, last_choice, image_guide, gr.State("Control Image")], outputs = [image_guide] ) + video_info_to_image_guide_btn.click(fn=image_to_ref_image_guide, inputs =[state, output, last_choice], outputs = [image_guide, image_mask_guide] ) video_info_to_image_mask_btn.click(fn=image_to_ref_image_set, inputs =[state, output, last_choice, image_mask, gr.State("Image Mask")], outputs = [image_mask] ) video_info_to_reference_image_btn.click(fn=image_to_ref_image_add, inputs =[state, output, last_choice, image_refs, gr.State("Ref Image")], 
outputs = [image_refs] ) video_info_postprocessing_btn.click(fn=apply_post_processing, inputs =[state, output, last_choice, PP_temporal_upsampling, PP_spatial_upsampling, PP_film_grain_intensity, PP_film_grain_saturation], outputs = [mode, generate_trigger, add_to_queue_trigger ] ) video_info_remux_audio_btn.click(fn=remux_audio, inputs =[state, output, last_choice, PP_MMAudio_setting, PP_MMAudio_prompt, PP_MMAudio_neg_prompt, PP_MMAudio_seed, PP_repeat_generation, PP_custom_audio], outputs = [mode, generate_trigger, add_to_queue_trigger ] ) save_lset_btn.click(validate_save_lset, inputs=[state, lset_name], outputs=[apply_lset_btn, refresh_lora_btn, delete_lset_btn, save_lset_btn,confirm_save_lset_btn, cancel_lset_btn, save_lset_prompt_drop]) delete_lset_btn.click(validate_delete_lset, inputs=[state, lset_name], outputs=[apply_lset_btn, refresh_lora_btn, delete_lset_btn, save_lset_btn,confirm_delete_lset_btn, cancel_lset_btn ]) - confirm_save_lset_btn.click(fn=validate_wizard_prompt, inputs =[state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , outputs= [prompt]).then( + confirm_save_lset_btn.click(fn=validate_wizard_prompt, inputs =[state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , outputs= [prompt], show_progress="hidden",).then( fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None).then( @@ -8087,7 +8201,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non lset_name.select(fn=update_lset_type, inputs=[state, lset_name], outputs=save_lset_prompt_drop) export_settings_from_file_btn.click(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8104,7 +8219,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non image_mode_tabs.select(fn=record_image_mode_tab, inputs=[state], outputs= None ).then(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8112,7 +8228,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non settings_file.upload(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8126,17 +8243,20 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non refresh_form_trigger.change(fn= fill_inputs, inputs=[state], - outputs=gen_inputs + extra_inputs + outputs=gen_inputs + extra_inputs, + show_progress= "full" if args.debug_gen_form else "hidden", ).then(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars], - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ) model_family.input(fn=change_model_family, inputs=[state, model_family], outputs= [model_choice]) model_choice.change(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - 
outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None @@ -8145,7 +8265,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non outputs= [header] ).then(fn= fill_inputs, inputs=[state], - outputs=gen_inputs + extra_inputs + outputs=gen_inputs + extra_inputs, + show_progress="full" if args.debug_gen_form else "hidden", ).then(fn= preload_model_when_switching, inputs=[state], outputs=[gen_status]) @@ -8154,13 +8275,15 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non generate_trigger.change(fn=validate_wizard_prompt, inputs= [state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None ).then(fn=process_prompt_and_add_tasks, inputs = [state, model_choice], - outputs= queue_df + outputs= queue_df, + show_progress="hidden", ).then(fn=prepare_generate_video, inputs= [state], outputs= [generate_btn, add_to_queue_btn, current_gen_column, current_gen_buttons_row] @@ -8170,10 +8293,12 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non ).then( fn=lambda s: gr.Accordion(open=True) if len(get_gen_info(s).get("queue", [])) > 1 else gr.update(), inputs=[state], - outputs=[queue_accordion] + outputs=[queue_accordion], + show_progress="hidden", ).then(fn=process_tasks, inputs= [state], outputs= [preview_trigger, output_trigger], + show_progress="hidden", ).then(finalize_generation, inputs= [state], outputs= [output, abort_btn, generate_btn, add_to_queue_btn, current_gen_column, gen_info] @@ -8280,17 +8405,20 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non # gr.on(triggers=[add_to_queue_btn.click, add_to_queue_trigger.change],fn=validate_wizard_prompt, add_to_queue_trigger.change(fn=validate_wizard_prompt, inputs =[state, wizard_prompt_activated_var, wizard_variables_var, prompt, wizard_prompt, *prompt_vars] , - outputs= [prompt] + outputs= [prompt], + show_progress="hidden", ).then(fn=save_inputs, inputs =[target_state] + gen_inputs, outputs= None ).then(fn=process_prompt_and_add_tasks, inputs = [state, model_choice], - outputs=queue_df + outputs=queue_df, + show_progress="hidden", ).then( fn=lambda s: gr.Accordion(open=True) if len(get_gen_info(s).get("queue", [])) > 1 else gr.update(), inputs=[state], - outputs=[queue_accordion] + outputs=[queue_accordion], + show_progress="hidden", ).then( fn=update_status, inputs = [state], @@ -8302,8 +8430,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non outputs=[modal_container] ) - return ( state, loras_choices, lset_name, resolution, - video_guide, image_guide, video_mask, image_mask, image_refs, refresh_form_trigger + return ( state, loras_choices, lset_name, resolution, refresh_form_trigger, + # video_guide, image_guide, video_mask, image_mask, image_refs, ) @@ -8339,8 +8467,9 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice fit_canvas_choice = gr.Dropdown( choices=[ - ("Dimensions correspond to the Pixels Budget (as the Prompt Image/Video will be resized to match this pixels budget, output video height or width may exceed the requested dimensions )", 0), - ("Dimensions correspond to the Maximum Width and Height (as the Prompt Image/Video will be resized to fit into these dimensions, the output video may 
be smaller)", 1), + ("Dimensions correspond to the Pixels Budget (as the Prompt Image/Video will be Resized to match this pixels Budget, output video height or width may exceed the requested dimensions )", 0), + ("Dimensions correspond to the Maximum Width and Height (as the Prompt Image/Video will be Resized to fit into these dimensions, the output video may be smaller)", 1), + ("Dimensions correspond to the Output Width and Height (as the Prompt Image/Video will be Cropped to fit exactly these dimensions)", 2), ], value= server_config.get("fit_canvas", 0), label="Generated Video Dimensions when Prompt contains an Image or a Video", @@ -9231,9 +9360,17 @@ def create_ui(): console.log('Events dispatched for column:', index); } }; - console.log('sendColIndex function attached to window'); - } + + // cancel wheel usage inside image editor + const hit = n => n?.id === "img_editor" || n?.classList?.contains("wheel-pass"); + addEventListener("wheel", e => { + const path = e.composedPath?.() || (() => { let a=[],n=e.target; for(;n;n=n.parentNode||n.host) a.push(n); return a; })(); + if (path.some(hit)) e.stopImmediatePropagation(); + }, { capture: true, passive: true }); + + } + """ if server_config.get("display_stats", 0) == 1: from shared.utils.stats import SystemStatsApp @@ -9264,13 +9401,13 @@ def create_ui(): stats_element = stats_app.get_gradio_element() with gr.Row(): - ( state, loras_choices, lset_name, resolution, - video_guide, image_guide, video_mask, image_mask, image_refs, refresh_form_trigger + ( state, loras_choices, lset_name, resolution, refresh_form_trigger + # video_guide, image_guide, video_mask, image_mask, image_refs, ) = generate_video_tab(model_family=model_family, model_choice=model_choice, header=header, main = main, main_tabs =main_tabs) with gr.Tab("Guides", id="info") as info_tab: generate_info_tab() with gr.Tab("Video Mask Creator", id="video_mask_creator") as video_mask_creator: - matanyone_app.display(main_tabs, tab_state, server_config, video_guide, image_guide, video_mask, image_mask, image_refs) + matanyone_app.display(main_tabs, tab_state, state, refresh_form_trigger, server_config, get_current_model_settings) #, video_guide, image_guide, video_mask, image_mask, image_refs) if not args.lock_config: with gr.Tab("Downloads", id="downloads") as downloads_tab: generate_download_tab(lset_name, loras_choices, state) From e7c08d12c846ddf4817013e1b112b1ecf183ada3 Mon Sep 17 00:00:00 2001 From: DeepBeepMeep Date: Wed, 10 Sep 2025 01:47:55 +0200 Subject: [PATCH 2/3] fixed unwanted discontinuity with at the end of first sliding window with InfiniteTalk --- models/wan/any2video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/wan/any2video.py b/models/wan/any2video.py index bb91dc6..1e62c35 100644 --- a/models/wan/any2video.py +++ b/models/wan/any2video.py @@ -539,7 +539,7 @@ class WanAny2V: new_shot = "Q" in video_prompt_type else: if pre_video_frame is None: - new_shot = True + new_shot = "Q" in video_prompt_type else: if input_ref_images is None: input_ref_images, new_shot = [pre_video_frame], False From 9fa267087b2dfdba651fd173325537f031edf91d Mon Sep 17 00:00:00 2001 From: DeepBeepMeep Date: Thu, 11 Sep 2025 21:23:05 +0200 Subject: [PATCH 3/3] Flux Festival --- README.md | 9 +++- defaults/flux_dev_umo.json | 24 ++++++++++ defaults/flux_dev_uso.json | 2 +- defaults/flux_srpo.json | 15 ++++++ defaults/flux_srpo_uso.json | 17 +++++++ models/flux/flux_handler.py | 22 +++++++++ models/flux/flux_main.py | 77 +++++++++++++++++++++++++------ 
models/flux/model.py | 15 ++++++ models/flux/sampling.py | 57 +++++++++++++++++++++-- models/flux/util.py | 32 +++++++++++++ models/qwen/pipeline_qwenimage.py | 19 +++++--- models/qwen/qwen_handler.py | 1 + models/wan/any2video.py | 2 +- models/wan/wan_handler.py | 1 + wgp.py | 62 ++++++++++++++++--------- 15 files changed, 305 insertions(+), 50 deletions(-) create mode 100644 defaults/flux_dev_umo.json create mode 100644 defaults/flux_srpo.json create mode 100644 defaults/flux_srpo_uso.json diff --git a/README.md b/README.md index fc3d76c..d33b6dc 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep ## 🔥 Latest Updates : -### September 5 2025: WanGP v8.5 - Wanna be a Cropper or a Painter ? +### September 11 2025: WanGP v8.5/8.55 - Wanna be a Cropper or a Painter ? I have done some intensive internal refactoring of the generation pipeline to ease support of existing models or add new models. Nothing really visible but this makes WanGP is little more future proof. @@ -38,6 +38,13 @@ Doing more sophisticated thing Vace Image Editor works very well too: try Image For the best quality I recommend to set in *Quality Tab* the option: "*Generate a 9 Frames Long video...*" +**Update 8.55**: Flux Festival +- **Inpainting Mode** has also been added for *Flux Kontext* +- **Flux SRPO**: a new finetune with 3x better quality than Flux Dev, according to its authors. I have also created a *Flux SRPO USO* finetune, which is certainly the best open source *Style Transfer* tool available +- **Flux UMO**: a model specialized in combining multiple reference objects / people. It works quite well at 768x768 + +Good luck finding your way through all the Flux model names! + ### September 5 2025: WanGP v8.4 - Take me to Outer Space You have probably seen these short AI generated movies created using *Nano Banana* and the *First Frame - Last Frame* feature of *Kling 2.0*. The idea is to generate an image, modify a part of it with Nano Banana and give the these two images to Kling that will generate the Video between these two images, use now the previous Last Frame as the new First Frame, rinse and repeat and you get a full movie. diff --git a/defaults/flux_dev_umo.json b/defaults/flux_dev_umo.json new file mode 100644 index 0000000..57164bb --- /dev/null +++ b/defaults/flux_dev_umo.json @@ -0,0 +1,24 @@ +{ + "model": { + "name": "Flux 1 Dev UMO 12B", + "architecture": "flux", + "description": "FLUX.1 Dev UMO is a model that can Edit Images with a specialization in combining multiple image references (resized internally at 512x512 max) to produce an Image output. 
Best Image preservation at 768x768 Resolution Output.", + "URLs": "flux", + "flux-model": "flux-dev-umo", + "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-UMO_dit_lora_bf16.safetensors"], + "resolutions": [ ["1024x1024 (1:1)", "1024x1024"], + ["768x1024 (3:4)", "768x1024"], + ["1024x768 (4:3)", "1024x768"], + ["512x1024 (1:2)", "512x1024"], + ["1024x512 (2:1)", "1024x512"], + ["768x768 (1:1)", "768x768"], + ["768x512 (3:2)", "768x512"], + ["512x768 (2:3)", "512x768"]] + }, + "prompt": "the man is wearing a hat", + "embedded_guidance_scale": 4, + "resolution": "768x768", + "batch_size": 1 +} + + \ No newline at end of file diff --git a/defaults/flux_dev_uso.json b/defaults/flux_dev_uso.json index 0cd7b82..806dd7e 100644 --- a/defaults/flux_dev_uso.json +++ b/defaults/flux_dev_uso.json @@ -2,7 +2,7 @@ "model": { "name": "Flux 1 Dev USO 12B", "architecture": "flux", - "description": "FLUX.1 Dev USO is a model specialized to Edit Images with a specialization in Style Transfers (up to two).", + "description": "FLUX.1 Dev USO is a model that can Edit Images with a specialization in Style Transfers (up to two).", "modules": [ ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_projector_bf16.safetensors"]], "URLs": "flux", "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_dit_lora_bf16.safetensors"], diff --git a/defaults/flux_srpo.json b/defaults/flux_srpo.json new file mode 100644 index 0000000..59f07c6 --- /dev/null +++ b/defaults/flux_srpo.json @@ -0,0 +1,15 @@ +{ + "model": { + "name": "Flux 1 SRPO Dev 12B", + "architecture": "flux", + "description": "By fine-tuning the FLUX.1.dev model with optimized denoising and online reward adjustment, SRPO improves its human-evaluated realism and aesthetic quality by over 3x.", + "URLs": [ + "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_bf16.safetensors", + "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_quanto_bf16_int8.safetensors" + ], + "flux-model": "flux-dev" + }, + "prompt": "draw a hat", + "resolution": "1024x1024", + "batch_size": 1 +} \ No newline at end of file diff --git a/defaults/flux_srpo_uso.json b/defaults/flux_srpo_uso.json new file mode 100644 index 0000000..ddfe50d --- /dev/null +++ b/defaults/flux_srpo_uso.json @@ -0,0 +1,17 @@ +{ + "model": { + "name": "Flux 1 SRPO USO 12B", + "architecture": "flux", + "description": "FLUX.1 SRPO USO is a model that can Edit Images with a specialization in Style Transfers (up to two). 
It leverages the improved Image quality brought by the SRPO process", + "modules": [ "flux_dev_uso"], + "URLs": "flux_srpo", + "loras": "flux_dev_uso", + "flux-model": "flux-dev-uso" + }, + "prompt": "the man is wearing a hat", + "embedded_guidance_scale": 4, + "resolution": "1024x1024", + "batch_size": 1 +} + + \ No newline at end of file diff --git a/models/flux/flux_handler.py b/models/flux/flux_handler.py index c468d5a..808369f 100644 --- a/models/flux/flux_handler.py +++ b/models/flux/flux_handler.py @@ -13,6 +13,7 @@ class family_handler(): flux_schnell = flux_model == "flux-schnell" flux_chroma = flux_model == "flux-chroma" flux_uso = flux_model == "flux-dev-uso" + flux_umo = flux_model == "flux-dev-umo" flux_kontext = flux_model == "flux-dev-kontext" extra_model_def = { @@ -35,6 +36,7 @@ class family_handler(): } if flux_kontext: + extra_model_def["inpaint_support"] = True extra_model_def["image_ref_choices"] = { "choices": [ ("None", ""), @@ -43,6 +45,15 @@ class family_handler(): ], "letters_filter": "KI", } + extra_model_def["background_removal_label"]= "Remove Backgrounds only behind People / Objects except main Subject / Landscape" + elif flux_umo: + extra_model_def["image_ref_choices"] = { + "choices": [ + ("Conditional Images are People / Objects", "I"), + ], + "letters_filter": "I", + "visible": False + } extra_model_def["lock_image_refs_ratios"] = True @@ -131,10 +142,14 @@ class family_handler(): video_prompt_type = video_prompt_type.replace("I", "KI") ui_defaults["video_prompt_type"] = video_prompt_type + if settings_version < 2.34: + ui_defaults["denoising_strength"] = 1. + @staticmethod def update_default_settings(base_model_type, model_def, ui_defaults): flux_model = model_def.get("flux-model", "flux-dev") flux_uso = flux_model == "flux-dev-uso" + flux_umo = flux_model == "flux-dev-umo" flux_kontext = flux_model == "flux-dev-kontext" ui_defaults.update({ "embedded_guidance": 2.5, @@ -143,5 +158,12 @@ class family_handler(): if flux_kontext or flux_uso: ui_defaults.update({ "video_prompt_type": "KI", + "denoising_strength": 1., }) + elif flux_umo: + ui_defaults.update({ + "video_prompt_type": "I", + "remove_background_images_ref": 0, + }) + diff --git a/models/flux/flux_main.py b/models/flux/flux_main.py index 4d7c67d..6863711 100644 --- a/models/flux/flux_main.py +++ b/models/flux/flux_main.py @@ -23,6 +23,35 @@ from .util import ( ) from PIL import Image +def preprocess_ref(raw_image: Image.Image, long_size: int = 512): + # get the width and height of the original image + image_w, image_h = raw_image.size + + # work out the long and short sides + if image_w >= image_h: + new_w = long_size + new_h = int((long_size / image_w) * image_h) + else: + new_h = long_size + new_w = int((long_size / image_h) * image_w) + + # resize proportionally to the new width and height + raw_image = raw_image.resize((new_w, new_h), resample=Image.LANCZOS) + target_w = new_w // 16 * 16 + target_h = new_h // 16 * 16 + + # compute the crop start coordinates for a center crop + left = (new_w - target_w) // 2 + top = (new_h - target_h) // 2 + right = left + target_w + bottom = top + target_h + + # apply the center crop + raw_image = raw_image.crop((left, top, right, bottom)) + + # convert to RGB mode + raw_image = raw_image.convert("RGB") + return raw_image def stitch_images(img1, img2): # Resize img2 to match img1's height width1, height1 = img1.size @@ -67,7 +96,7 @@ class model_factory: # self.name= "flux-schnell" source = model_def.get("source", None) self.model = load_flow_model(self.name, model_filename[0] if source is None else source, torch_device) - + self.model_def = model_def self.vae = load_ae(self.name, device=torch_device) siglip_processor = siglip_model = 
feature_embedder = None @@ -109,10 +138,12 @@ class model_factory: def generate( self, seed: int | None = None, - input_prompt: str = "replace the logo with the text 'Black Forest Labs'", + input_prompt: str = "replace the logo with the text 'Black Forest Labs'", n_prompt: str = None, sampling_steps: int = 20, input_ref_images = None, + image_guide= None, + image_mask= None, width= 832, height=480, embedded_guidance_scale: float = 2.5, @@ -123,7 +154,8 @@ class model_factory: batch_size = 1, video_prompt_type = "", joint_pass = False, - image_refs_relative_size = 100, + image_refs_relative_size = 100, + denoising_strength = 1., **bbargs ): if self._interrupt: @@ -132,8 +164,16 @@ class model_factory: if n_prompt is None or len(n_prompt) == 0: n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors" device="cuda" flux_dev_uso = self.name in ['flux-dev-uso'] - image_stiching = not self.name in ['flux-dev-uso'] #and False + flux_dev_umo = self.name in ['flux-dev-umo'] + latent_stiching = self.name in ['flux-dev-uso', 'flux-dev-umo'] + + lock_dimensions= False + input_ref_images = [] if input_ref_images is None else input_ref_images[:] + if flux_dev_umo: + ref_long_side = 512 if len(input_ref_images) <= 1 else 320 + input_ref_images = [preprocess_ref(img, ref_long_side) for img in input_ref_images] + lock_dimensions = True ref_style_imgs = [] if "I" in video_prompt_type and len(input_ref_images) > 0: if flux_dev_uso : @@ -143,22 +183,26 @@ class model_factory: elif len(input_ref_images) > 1 : ref_style_imgs = input_ref_images[-1:] input_ref_images = input_ref_images[:-1] - if image_stiching: + + if latent_stiching: + # latents stiching with resize + if not lock_dimensions : + for i in range(len(input_ref_images)): + w, h = input_ref_images[i].size + image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, 0) + input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + else: # image stiching method stiched = input_ref_images[0] for new_img in input_ref_images[1:]: stiched = stitch_images(stiched, new_img) input_ref_images = [stiched] - else: - # latents stiching with resize - for i in range(len(input_ref_images)): - w, h = input_ref_images[i].size - image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas) - input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + elif image_guide is not None: + input_ref_images = [image_guide] else: input_ref_images = None - if flux_dev_uso : + if self.name in ['flux-dev-uso', 'flux-dev-umo'] : inp, height, width = prepare_multi_ip( ae=self.vae, img_cond_list=input_ref_images, @@ -177,6 +221,7 @@ class model_factory: bs=batch_size, seed=seed, device=device, + img_mask=image_mask, ) inp.update(prepare_prompt(self.t5, self.clip, batch_size, input_prompt)) @@ -198,13 +243,19 @@ class model_factory: return unpack(x.float(), height, width) # denoise initial noise - x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass) + x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, 
real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass, denoising_strength = denoising_strength) if x==None: return None # decode latents to pixel space x = unpack_latent(x) with torch.autocast(device_type=device, dtype=torch.bfloat16): x = self.vae.decode(x) + if image_mask is not None: + from shared.utils.utils import convert_image_to_tensor + img_msk_rebuilt = inp["img_msk_rebuilt"] + img= convert_image_to_tensor(image_guide) + x = img.squeeze(2) * (1 - img_msk_rebuilt) + x.to(img) * img_msk_rebuilt + x = x.clamp(-1, 1) x = x.transpose(0, 1) return x diff --git a/models/flux/model.py b/models/flux/model.py index c4642d0..c5f7a24 100644 --- a/models/flux/model.py +++ b/models/flux/model.py @@ -190,6 +190,21 @@ class Flux(nn.Module): v = swap_scale_shift(v) k = k.replace("norm_out.linear", "final_layer.adaLN_modulation.1") new_sd[k] = v + # elif not first_key.startswith("diffusion_model.") and not first_key.startswith("transformer."): + # for k,v in sd.items(): + # if "double" in k: + # k = k.replace(".processor.proj_lora1.", ".img_attn.proj.lora_") + # k = k.replace(".processor.proj_lora2.", ".txt_attn.proj.lora_") + # k = k.replace(".processor.qkv_lora1.", ".img_attn.qkv.lora_") + # k = k.replace(".processor.qkv_lora2.", ".txt_attn.qkv.lora_") + # else: + # k = k.replace(".processor.qkv_lora.", ".linear1_qkv.lora_") + # k = k.replace(".processor.proj_lora.", ".linear2.lora_") + + # k = "diffusion_model." + k + # new_sd[k] = v + # from mmgp import safetensors2 + # safetensors2.torch_write_file(new_sd, "fff.safetensors") else: new_sd = sd return new_sd diff --git a/models/flux/sampling.py b/models/flux/sampling.py index f43ae15..1b4813a 100644 --- a/models/flux/sampling.py +++ b/models/flux/sampling.py @@ -138,10 +138,12 @@ def prepare_kontext( target_width: int | None = None, target_height: int | None = None, bs: int = 1, - + img_mask = None, ) -> tuple[dict[str, Tensor], int, int]: # load and encode the conditioning image + res_match_output = img_mask is not None + img_cond_seq = None img_cond_seq_ids = None if img_cond_list == None: img_cond_list = [] @@ -150,9 +152,11 @@ def prepare_kontext( for cond_no, img_cond in enumerate(img_cond_list): width, height = img_cond.size aspect_ratio = width / height - - # Kontext is trained on specific resolutions, using one of them is recommended - _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS) + if res_match_output: + width, height = target_width, target_height + else: + # Kontext is trained on specific resolutions, using one of them is recommended + _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS) width = 2 * int(width / 16) height = 2 * int(height / 16) @@ -193,6 +197,19 @@ def prepare_kontext( "img_cond_seq": img_cond_seq, "img_cond_seq_ids": img_cond_seq_ids, } + if img_mask is not None: + from shared.utils.utils import convert_image_to_tensor, convert_tensor_to_image + # image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) + image_mask_latents = convert_image_to_tensor(img_mask.resize((target_width // 16, target_height // 16), resample=Image.Resampling.LANCZOS)) + image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. 
)[0:1] + image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0) + convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") + image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device) + return_dict.update({ + "img_msk_latents": image_mask_latents, + "img_msk_rebuilt": image_mask_rebuilt, + }) + img = get_noise( bs, target_height, @@ -264,6 +281,9 @@ def denoise( loras_slists=None, unpack_latent = None, joint_pass= False, + img_msk_latents = None, + img_msk_rebuilt = None, + denoising_strength = 1, ): kwargs = {'pipeline': pipeline, 'callback': callback, "img_len" : img.shape[1], "siglip_embedding": siglip_embedding, "siglip_embedding_ids": siglip_embedding_ids} @@ -271,6 +291,21 @@ def denoise( if callback != None: callback(-1, None, True) + original_image_latents = None if img_cond_seq is None else img_cond_seq.clone() + + morph, first_step = False, 0 + if img_msk_latents is not None: + randn = torch.randn_like(original_image_latents) + if denoising_strength < 1.: + first_step = int(len(timesteps) * (1. - denoising_strength)) + if not morph: + latent_noise_factor = timesteps[first_step] + latents = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor + img = latents.to(img) + latents = None + timesteps = timesteps[first_step:] + + updated_num_steps= len(timesteps) -1 if callback != None: from shared.utils.loras_mutipliers import update_loras_slists @@ -280,10 +315,14 @@ def denoise( # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) for i, (t_curr, t_prev) in enumerate(zip(timesteps[:-1], timesteps[1:])): - offload.set_step_no_for_lora(model, i) + offload.set_step_no_for_lora(model, first_step + i) if pipeline._interrupt: return None + if img_msk_latents is not None and denoising_strength <1. 
and i == first_step and morph: + latent_noise_factor = t_curr/1000 + img = original_image_latents * (1.0 - latent_noise_factor) + img * latent_noise_factor + t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) img_input = img img_input_ids = img_ids @@ -333,6 +372,14 @@ def denoise( pred = neg_pred + real_guidance_scale * (pred - neg_pred) img += (t_prev - t_curr) * pred + + if img_msk_latents is not None: + latent_noise_factor = t_prev + # noisy_image = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + noisy_image = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor + img = noisy_image * (1-img_msk_latents) + img_msk_latents * img + noisy_image = None + if callback is not None: preview = unpack_latent(img).transpose(0,1) callback(i, preview, False) diff --git a/models/flux/util.py b/models/flux/util.py index 0f96103..af75f62 100644 --- a/models/flux/util.py +++ b/models/flux/util.py @@ -640,6 +640,38 @@ configs = { shift_factor=0.1159, ), ), + "flux-dev-umo": ModelSpec( + repo_id="", + repo_flow="", + repo_ae="ckpts/flux_vae.safetensors", + params=FluxParams( + in_channels=64, + out_channels=64, + vec_in_dim=768, + context_in_dim=4096, + hidden_size=3072, + mlp_ratio=4.0, + num_heads=24, + depth=19, + depth_single_blocks=38, + axes_dim=[16, 56, 56], + theta=10_000, + qkv_bias=True, + guidance_embed=True, + eso= True, + ), + ae_params=AutoEncoderParams( + resolution=256, + in_channels=3, + ch=128, + out_ch=3, + ch_mult=[1, 2, 4, 4], + num_res_blocks=2, + z_channels=16, + scale_factor=0.3611, + shift_factor=0.1159, + ), + ), } diff --git a/models/qwen/pipeline_qwenimage.py b/models/qwen/pipeline_qwenimage.py index 20838f5..0897ee4 100644 --- a/models/qwen/pipeline_qwenimage.py +++ b/models/qwen/pipeline_qwenimage.py @@ -714,14 +714,14 @@ class QwenImagePipeline(): #DiffusionPipeline image_mask_latents = convert_image_to_tensor(image_mask.resize((width // 16, height // 16), resample=Image.Resampling.LANCZOS)) image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1] image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0) - convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") + # convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device) prompt_image = image if image.size != (image_width, image_height): image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS) - image.save("nnn.png") + # image.save("nnn.png") image = convert_image_to_tensor(image).unsqueeze(0).unsqueeze(2) has_neg_prompt = negative_prompt is not None or ( @@ -811,12 +811,15 @@ class QwenImagePipeline(): #DiffusionPipeline negative_txt_seq_lens = ( negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None ) - morph = False - if image_mask_latents is not None and denoising_strength <= 1.: - first_step = int(len(timesteps) * (1. - denoising_strength)) + morph, first_step = False, 0 + if image_mask_latents is not None: + randn = torch.randn_like(original_image_latents) + if denoising_strength < 1.: + first_step = int(len(timesteps) * (1. 
- denoising_strength)) if not morph: latent_noise_factor = timesteps[first_step]/1000 - latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + # latents = original_image_latents * (1.0 - latent_noise_factor) + torch.randn_like(original_image_latents) * latent_noise_factor + latents = original_image_latents * (1.0 - latent_noise_factor) + randn * latent_noise_factor timesteps = timesteps[first_step:] self.scheduler.timesteps = timesteps self.scheduler.sigmas= self.scheduler.sigmas[first_step:] @@ -831,6 +834,7 @@ class QwenImagePipeline(): #DiffusionPipeline for i, t in enumerate(timesteps): + offload.set_step_no_for_lora(self.transformer, first_step + i) if self.interrupt: continue @@ -905,7 +909,8 @@ class QwenImagePipeline(): #DiffusionPipeline if image_mask_latents is not None: next_t = timesteps[i+1] if i sliding_window_size: + if model_type in ["t2v"] and not "G" in video_prompt_type : + gr.Info(f"You have requested to Generate Sliding Windows with a Text to Video model. Unless you use the Video to Video feature this is useless as a t2v model doesn't see past frames and it will generate the same video in each new window.") + return full_video_length = video_length if video_source is None else video_length + sliding_window_overlap -1 extra = "" if full_video_length == video_length else f" including {sliding_window_overlap} added for Video Continuation" no_windows = compute_sliding_window_no(full_video_length, sliding_window_size, sliding_window_discard_last_frames, sliding_window_overlap) gr.Info(f"The Number of Frames to generate ({video_length}{extra}) is greater than the Sliding Window Size ({sliding_window_size}), {no_windows} Windows will be generated") - if "recam" in model_filename: if video_guide == None: gr.Info("You must provide a Control Video") @@ -7019,28 +7020,38 @@ def categorize_resolution(resolution_str): return group return "1440p" -def group_resolutions(resolutions, selected_resolution): +def group_resolutions(model_def, resolutions, selected_resolution): + + model_resolutions = model_def.get("resolutions", None) + if model_resolutions is not None: + selected_group ="Locked" + available_groups = [selected_group ] + selected_group_resolutions = model_resolutions + else: + grouped_resolutions = {} + for resolution in resolutions: + group = categorize_resolution(resolution[1]) + if group not in grouped_resolutions: + grouped_resolutions[group] = [] + grouped_resolutions[group].append(resolution) + + available_groups = [group for group in group_thresholds if group in grouped_resolutions] - grouped_resolutions = {} - for resolution in resolutions: - group = categorize_resolution(resolution[1]) - if group not in grouped_resolutions: - grouped_resolutions[group] = [] - grouped_resolutions[group].append(resolution) - - available_groups = [group for group in group_thresholds if group in grouped_resolutions] - - selected_group = categorize_resolution(selected_resolution) - selected_group_resolutions = grouped_resolutions.get(selected_group, []) - available_groups.reverse() + selected_group = categorize_resolution(selected_resolution) + selected_group_resolutions = grouped_resolutions.get(selected_group, []) + available_groups.reverse() return available_groups, selected_group_resolutions, selected_group def change_resolution_group(state, selected_group): model_type = state["model_type"] model_def = get_model_def(model_type) model_resolutions = model_def.get("resolutions", None) - resolution_choices, _ = 
get_resolution_choices(None, model_resolutions) - group_resolution_choices = [ resolution for resolution in resolution_choices if categorize_resolution(resolution[1]) == selected_group ] + resolution_choices, _ = get_resolution_choices(None, model_resolutions) + if model_resolutions is None: + group_resolution_choices = [ resolution for resolution in resolution_choices if categorize_resolution(resolution[1]) == selected_group ] + else: + last_resolution = group_resolution_choices[0][1] + return gr.update(choices= group_resolution_choices, value= last_resolution) last_resolution_per_group = state["last_resolution_per_group"] last_resolution = last_resolution_per_group.get(selected_group, "") @@ -7051,6 +7062,11 @@ def change_resolution_group(state, selected_group): def record_last_resolution(state, resolution): + + model_type = state["model_type"] + model_def = get_model_def(model_type) + model_resolutions = model_def.get("resolutions", None) + if model_resolutions is not None: return server_config["last_resolution_choice"] = resolution selected_group = categorize_resolution(resolution) last_resolution_per_group = state["last_resolution_per_group"] @@ -7482,11 +7498,13 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non frames_positions = gr.Text(value=ui_defaults.get("frames_positions","") , visible= "F" in video_prompt_type_value, scale = 2, label= "Positions of Injected Frames separated by Spaces (1=first, no position for Objects / People)" ) image_refs_relative_size = gr.Slider(20, 100, value=ui_defaults.get("image_refs_relative_size", 50), step=1, label="Rescale Internaly Image Ref (% in relation to Output Video) to change Output Composition", visible = model_def.get("any_image_refs_relative_size", False) and image_outputs) - no_background_removal = model_def.get("no_background_removal", False) + no_background_removal = model_def.get("no_background_removal", False) or image_ref_choices is None + background_removal_label = model_def.get("background_removal_label", "Remove Backgrounds behind People / Objects") + remove_background_images_ref = gr.Dropdown( choices=[ ("Keep Backgrounds behind all Reference Images", 0), - ("Remove Backgrounds only behind People / Objects except main Subject / Landscape" if (flux or qwen) else ("Remove Backgrounds behind People / Objects, keep it for Landscape or positioned Frames" if vace else "Remove Backgrounds behind People / Objects") , 1), + (background_removal_label, 1), ], value=0 if no_background_removal else ui_defaults.get("remove_background_images_ref",1), label="Automatic Removal of Background of People or Objects (Only)", scale = 3, visible= "I" in video_prompt_type_value and not no_background_removal @@ -7578,7 +7596,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non current_resolution_choice = ui_defaults.get("resolution","832x480") if update_form or last_resolution is None else last_resolution model_resolutions = model_def.get("resolutions", None) resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions) - available_groups, selected_group_resolutions, selected_group = group_resolutions(resolution_choices, current_resolution_choice) + available_groups, selected_group_resolutions, selected_group = group_resolutions(model_def,resolution_choices, current_resolution_choice) resolution_group = gr.Dropdown( choices = available_groups, value= selected_group,
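
To make the resolution-grouping change above easier to follow, here is a minimal sketch of what the reworked group_resolutions() does: resolutions are bucketed by pixel count into named groups, unless the model definition carries its own "resolutions" list, in which case a single "Locked" group is returned and the generic bucketing is bypassed. The pixel thresholds and the example values below are illustrative assumptions; only the function names, the "Locked" group and the "1440p" fallback come from the patch itself.

# Illustrative sketch only: approximates categorize_resolution()/group_resolutions()
# from the wgp.py hunk above. Thresholds and example data are assumed, not WanGP's.
GROUP_THRESHOLDS = {"480p": 640 * 480, "720p": 1280 * 720, "1080p": 1920 * 1080}

def categorize_resolution(resolution: str) -> str:
    width, height = map(int, resolution.split("x"))
    pixels = width * height
    for group, max_pixels in GROUP_THRESHOLDS.items():
        if pixels <= max_pixels:
            return group
    return "1440p"  # fallback bucket, mirroring the patch

def group_resolutions(model_def, resolutions, selected_resolution):
    # A model definition may pin its own "resolutions" list; it is then exposed
    # as a single "Locked" group and the generic bucketing is skipped entirely.
    model_resolutions = model_def.get("resolutions")
    if model_resolutions is not None:
        return ["Locked"], model_resolutions, "Locked"

    grouped = {}
    for label, value in resolutions:          # resolutions are (label, "WxH") pairs
        grouped.setdefault(categorize_resolution(value), []).append((label, value))

    available = [g for g in list(GROUP_THRESHOLDS) + ["1440p"] if g in grouped]
    available.reverse()                       # largest buckets listed first
    selected_group = categorize_resolution(selected_resolution)
    return available, grouped.get(selected_group, []), selected_group

# Example: a model without a pinned resolution list falls back to bucketing
groups, choices, current = group_resolutions(
    {}, [("832x480 (16:9)", "832x480"), ("1280x720 (16:9)", "1280x720")], "832x480")

The "Locked" short-circuit is what lets finetunes such as Flux UMO (which declares its own "resolutions" list in defaults/flux_dev_umo.json) constrain the resolution dropdown without touching the generic grouping tables.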