fixes and polish

DeepBeepMeep 2025-09-03 19:39:17 +02:00
parent 0871a3be58
commit a60eea2371
7 changed files with 105 additions and 28 deletions

View File

@@ -26,12 +26,13 @@ class family_handler():
model_def_output["no_background_removal"] = True
model_def_output["image_ref_choices"] = {
"choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "I"),
("Up to two Images are Style Images", "IJ")],
"default": "I",
"letters_filter": "IJ",
"choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "KI"),
("Up to two Images are Style Images", "KIJ")],
"default": "KI",
"letters_filter": "KIJ",
"label": "Reference Images / Style Images"
}
model_def_output["lock_image_refs_ratios"] = True
return model_def_output
@@ -107,6 +108,16 @@ class family_handler():
pipe["feature_embedder"] = flux_model.feature_embedder
return flux_model, pipe
@staticmethod
def fix_settings(base_model_type, settings_version, model_def, ui_defaults):
flux_model = model_def.get("flux-model", "flux-dev")
flux_uso = flux_model == "flux-dev-uso"
if flux_uso and settings_version < 2.29:
video_prompt_type = ui_defaults.get("video_prompt_type", "")
if "I" in video_prompt_type:
video_prompt_type = video_prompt_type.replace("I", "KI")
ui_defaults["video_prompt_type"] = video_prompt_type
@staticmethod
def update_default_settings(base_model_type, model_def, ui_defaults):
flux_model = model_def.get("flux-model", "flux-dev")
@@ -116,6 +127,6 @@ class family_handler():
})
if model_def.get("reference_image", False):
ui_defaults.update({
"video_prompt_type": "I" if flux_uso else "KI",
"video_prompt_type": "KI",
})
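For reference, a minimal standalone sketch (hypothetical helper, not part of this commit) of the migration the new fix_settings performs: presets saved before settings_version 2.29 store the reference-image mode as "I"/"IJ", which the updated letters_filter now expects as "KI"/"KIJ".
def migrate_video_prompt_type(ui_defaults: dict, settings_version: float) -> dict:
    # replicate the replacement fix_settings applies to flux-dev-uso presets
    if settings_version < 2.29:
        video_prompt_type = ui_defaults.get("video_prompt_type", "")
        if "I" in video_prompt_type:
            ui_defaults["video_prompt_type"] = video_prompt_type.replace("I", "KI")
    return ui_defaults
# Example: an old preset saved as "IJ" becomes "KIJ"
print(migrate_video_prompt_type({"video_prompt_type": "IJ"}, 2.28))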

View File

@@ -9,6 +9,9 @@ from shared.utils.utils import calculate_new_dimensions
from .sampling import denoise, get_schedule, prepare_kontext, prepare_prompt, prepare_multi_ip, unpack
from .modules.layers import get_linear_split_map
from transformers import SiglipVisionModel, SiglipImageProcessor
import torchvision.transforms.functional as TVF
import math
from shared.utils.utils import convert_image_to_tensor, convert_tensor_to_image
from .util import (
aspect_ratio_to_height_width,
@@ -21,6 +24,44 @@ from .util import (
from PIL import Image
def resize_and_centercrop_image(image, target_height_ref1, target_width_ref1):
target_height_ref1 = int(target_height_ref1 // 64 * 64)
target_width_ref1 = int(target_width_ref1 // 64 * 64)
h, w = image.shape[-2:]
if h < target_height_ref1 or w < target_width_ref1:
# compute the aspect ratio
aspect_ratio = w / h
if h < target_height_ref1:
new_h = target_height_ref1
new_w = new_h * aspect_ratio
if new_w < target_width_ref1:
new_w = target_width_ref1
new_h = new_w / aspect_ratio
else:
new_w = target_width_ref1
new_h = new_w / aspect_ratio
if new_h < target_height_ref1:
new_h = target_height_ref1
new_w = new_h * aspect_ratio
else:
aspect_ratio = w / h
tgt_aspect_ratio = target_width_ref1 / target_height_ref1
if aspect_ratio > tgt_aspect_ratio:
new_h = target_height_ref1
new_w = new_h * aspect_ratio
else:
new_w = target_width_ref1
new_h = new_w / aspect_ratio
# resize the image with TVF.resize
image = TVF.resize(image, (math.ceil(new_h), math.ceil(new_w)))
# compute the center-crop offsets
top = (image.shape[-2] - target_height_ref1) // 2
left = (image.shape[-1] - target_width_ref1) // 2
# center-crop with TVF.crop
image = TVF.crop(image, top, left, target_height_ref1, target_width_ref1)
return image
def stitch_images(img1, img2):
# Resize img2 to match img1's height
width1, height1 = img1.size
@@ -129,11 +170,11 @@ class model_factory:
if n_prompt is None or len(n_prompt) == 0: n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
device="cuda"
flux_dev_uso = self.name in ['flux-dev-uso']
image_stiching = not self.name in ['flux-dev-uso']
image_stiching = not self.name in ['flux-dev-uso'] #and False
# image_refs_relative_size = 100
crop = False
input_ref_images = [] if input_ref_images is None else input_ref_images[:]
ref_style_imgs = []
if "I" in video_prompt_type and len(input_ref_images) > 0:
if flux_dev_uso :
if "J" in video_prompt_type:
@@ -148,7 +189,7 @@ class model_factory:
if "K" in video_prompt_type :
w, h = input_ref_images[0].size
height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
# actual rescale will happen in prepare_kontext
for new_img in input_ref_images[1:]:
stiched = stitch_images(stiched, new_img)
input_ref_images = [stiched]
@@ -157,14 +198,24 @@ class model_factory:
if "K" in video_prompt_type:
# image latents tiling method
w, h = input_ref_images[0].size
if crop :
img = convert_image_to_tensor(input_ref_images[0])
img = resize_and_centercrop_image(img, height, width)
input_ref_images[0] = convert_tensor_to_image(img)
else:
height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
input_ref_images[0] = input_ref_images[0].resize((width, height), resample=Image.Resampling.LANCZOS)
first_ref = 1
for i in range(first_ref,len(input_ref_images)):
w, h = input_ref_images[i].size
if crop:
img = convert_image_to_tensor(input_ref_images[i])
img = resize_and_centercrop_image(img, int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100))
input_ref_images[i] = convert_tensor_to_image(img)
else:
image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas)
input_ref_images[0] = input_ref_images[0].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
else:
input_ref_images = None
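For illustration, a minimal usage sketch of resize_and_centercrop_image from the hunk above (assumed environment: torch and torchvision installed, and the function in scope). Targets are snapped down to multiples of 64, the image is resized so both sides cover the target, then center-cropped to exactly the target size.
import torch
# 3 x 720 x 1280 stand-in for a reference image converted with convert_image_to_tensor
img = torch.rand(3, 720, 1280)
out = resize_and_centercrop_image(img, 1000, 1000)
# targets snap to 960 x 960; the image is upscaled to height 960, then center-cropped in width
print(out.shape)  # torch.Size([3, 960, 960])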

View File

@@ -153,7 +153,6 @@ def prepare_kontext(
# Kontext is trained on specific resolutions, using one of them is recommended
_, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS)
width = 2 * int(width / 16)
height = 2 * int(height / 16)

View File

@@ -15,6 +15,7 @@ class family_handler():
("Default", "default"),
("Lightning", "lightning")],
"guidance_max_phases" : 1,
"lock_image_refs_ratios": True,
}

View File

@@ -117,6 +117,8 @@ class family_handler():
extra_model_def["no_background_removal"] = True
# extra_model_def["at_least_one_image_ref_needed"] = True
if base_model_type in ["standin"] or vace_class:
extra_model_def["lock_image_refs_ratios"] = True
# if base_model_type in ["phantom_1.3B", "phantom_14B"]:
# extra_model_def["one_image_ref_needed"] = True

View File

@@ -18,11 +18,11 @@ import os
import tempfile
import subprocess
import json
from functools import lru_cache
from PIL import Image
video_info_cache = []
def seed_everything(seed: int):
random.seed(seed)
np.random.seed(seed)
@@ -77,7 +77,9 @@ def truncate_for_filesystem(s, max_bytes=255):
else: r = m - 1
return s[:l]
@lru_cache(maxsize=100)
def get_video_info(video_path):
global video_info_cache
import cv2
cap = cv2.VideoCapture(video_path)
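A minimal sketch (hypothetical names, not repository code) of the caching pattern this file moves to: functools.lru_cache memoizes results per video_path, which is what replaces the hand-rolled video_info_cache list.
from functools import lru_cache
@lru_cache(maxsize=100)
def probe_video(video_path: str) -> tuple:
    # stand-in for get_video_info; the real function opens the file with cv2.VideoCapture
    print(f"probing {video_path}")
    return (30.0, 150)  # e.g. (fps, frame_count)
probe_video("clip.mp4")  # runs the probe and caches the result
probe_video("clip.mp4")  # served from the cache, no second probe
Note that lru_cache keys on the call arguments, so they must be hashable (plain path strings are fine).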

wgp.py
View File

@@ -60,8 +60,8 @@ AUTOSAVE_FILENAME = "queue.zip"
PROMPT_VARS_MAX = 10
target_mmgp_version = "3.5.12"
WanGP_version = "8.32"
settings_version = 2.28
WanGP_version = "8.33"
settings_version = 2.29
max_source_video_frames = 3000
prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
@@ -3313,6 +3313,7 @@ def select_video(state, input_file_list, event_data: gr.EventData):
if not all_letters(src, pos): return False
if neg is not None and any_letters(src, neg): return False
return True
image_outputs = configs.get("image_mode",0) == 1
map_video_prompt = {"V" : "Control Video", ("VA", "U") : "Mask Video", "I" : "Reference Images"}
map_image_prompt = {"V" : "Source Video", "L" : "Last Video", "S" : "Start Image", "E" : "End Image"}
map_audio_prompt = {"A" : "Audio Source", "B" : "Audio Source #2"}
@@ -3364,6 +3365,7 @@ def select_video(state, input_file_list, event_data: gr.EventData):
if multiple_submodels:
video_guidance_scale += f" + Model Switch at {video_switch_threshold if video_model_switch_phase ==1 else video_switch_threshold2}"
video_flow_shift = configs.get("flow_shift", None)
if image_outputs: video_flow_shift = None
video_video_guide_outpainting = configs.get("video_guide_outpainting", "")
video_outpainting = ""
if len(video_video_guide_outpainting) > 0 and not video_video_guide_outpainting.startswith("#") \
@@ -4545,7 +4547,8 @@ def generate_video(
send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
from shared.utils.utils import resize_and_remove_background
image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= not (any_background_ref or vace or standin) ) # no fit for vace ref images as it is done later
# keep image ratios if there is a background image ref (we will let the model preprocessor decide what to do) but remove bg if requested
image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= not (any_background_ref or model_def.get("lock_image_refs_ratios", False)) ) # no fit for vace ref images as it is done later
update_task_thumbnails(task, locals())
send_cmd("output")
joint_pass = boost ==1 #and profile != 1 and profile != 3
@@ -5912,6 +5915,8 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None
if target == "settings":
return inputs
image_outputs = inputs.get("image_mode",0) == 1
pop=[]
if "force_fps" in inputs and len(inputs["force_fps"])== 0:
pop += ["force_fps"]
@@ -5977,7 +5982,7 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None
if guidance_max_phases < 3 or guidance_phases < 3:
pop += ["guidance3_scale", "switch_threshold2", "model_switch_phase"]
if ltxv:
if ltxv or image_outputs:
pop += ["flow_shift"]
if model_def.get("no_negative_prompt", False) :
@@ -6876,11 +6881,15 @@ def detect_auto_save_form(state, evt:gr.SelectData):
return gr.update()
def compute_video_length_label(fps, current_video_length):
if fps is None:
return f"Number of frames"
else:
return f"Number of frames ({fps} frames = 1s), current duration: {(current_video_length / fps):.1f}s",
def refresh_video_length_label(state, current_video_length):
fps = get_model_fps(get_base_model_type(state["model_type"]))
return gr.update(label= compute_video_length_label(fps, current_video_length))
def refresh_video_length_label(state, current_video_length, force_fps, video_guide, video_source):
base_model_type = get_base_model_type(state["model_type"])
computed_fps = get_computed_fps(force_fps, base_model_type , video_guide, video_source )
return gr.update(label= compute_video_length_label(computed_fps, current_video_length))
def generate_video_tab(update_form = False, state_dict = None, ui_defaults = None, model_family = None, model_choice = None, header = None, main = None, main_tabs= None):
global inputs_names #, advanced
@@ -7469,8 +7478,9 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
current_video_length = ui_defaults.get("video_length", 81 if get_model_family(base_model_type)=="wan" else 97)
computed_fps = get_computed_fps(ui_defaults.get("force_fps",""), base_model_type , video_guide, video_source )
video_length = gr.Slider(min_frames, get_max_frames(737 if test_any_sliding_window(base_model_type) else 337), value=current_video_length,
step=frames_step, label=compute_video_length_label(fps, current_video_length) , visible = True, interactive= True)
step=frames_step, label=compute_video_length_label(computed_fps, current_video_length) , visible = True, interactive= True)
with gr.Row(visible = not lock_inference_steps) as inference_steps_row:
num_inference_steps = gr.Slider(1, 100, value=ui_defaults.get("num_inference_steps",30), step=1, label="Number of Inference Steps", visible = True)
@@ -7643,7 +7653,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
with gr.Column(visible = (t2v or vace) and not fantasy) as audio_prompt_type_remux_row:
gr.Markdown("<B>You may transfer the exising audio tracks of a Control Video</B>")
gr.Markdown("<B>You may transfer the existing audio tracks of a Control Video</B>")
audio_prompt_type_remux = gr.Dropdown(
choices=[
("No Remux", ""),
@@ -7955,7 +7965,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
extra_inputs = prompt_vars + [wizard_prompt, wizard_variables_var, wizard_prompt_activated_var, video_prompt_column, image_prompt_column,
prompt_column_advanced, prompt_column_wizard_vars, prompt_column_wizard, lset_name, save_lset_prompt_drop, advanced_row, speed_tab, audio_tab, mmaudio_col, quality_tab,
sliding_window_tab, misc_tab, prompt_enhancer_row, inference_steps_row, skip_layer_guidance_row, audio_guide_row, RIFLEx_setting_col,
video_prompt_type_video_guide, video_prompt_type_video_guide_alt, video_prompt_type_video_mask, video_prompt_type_image_refs, apg_col, audio_prompt_type_sources, audio_prompt_type_remux_row,
video_prompt_type_video_guide, video_prompt_type_video_guide_alt, video_prompt_type_video_mask, video_prompt_type_image_refs, apg_col, audio_prompt_type_sources, audio_prompt_type_remux, audio_prompt_type_remux_row,
video_guide_outpainting_col,video_guide_outpainting_top, video_guide_outpainting_bottom, video_guide_outpainting_left, video_guide_outpainting_right,
video_guide_outpainting_checkbox, video_guide_outpainting_row, show_advanced, video_info_to_control_video_btn, video_info_to_video_source_btn, sample_solver_row,
video_buttons_row, image_buttons_row, video_postprocessing_tab, audio_remuxing_tab, PP_MMAudio_row, PP_custom_audio_row,
@@ -7975,7 +7985,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
resolution.change(fn=record_last_resolution, inputs=[state, resolution])
video_length.release(fn=refresh_video_length_label, inputs=[state, video_length ], outputs = video_length, trigger_mode="always_last" )
# video_length.release(fn=refresh_video_length_label, inputs=[state, video_length ], outputs = video_length, trigger_mode="always_last" )
gr.on(triggers=[video_length.release, force_fps.change, video_guide.change, video_source.change], fn=refresh_video_length_label, inputs=[state, video_length, force_fps, video_guide, video_source] , outputs = video_length, trigger_mode="always_last" )
guidance_phases.change(fn=change_guidance_phases, inputs= [state, guidance_phases], outputs =[model_switch_phase, guidance_phases_row, switch_threshold, switch_threshold2, guidance2_scale, guidance3_scale ])
audio_prompt_type_remux.change(fn=refresh_audio_prompt_type_remux, inputs=[state, audio_prompt_type, audio_prompt_type_remux], outputs=[audio_prompt_type])
audio_prompt_type_sources.change(fn=refresh_audio_prompt_type_sources, inputs=[state, audio_prompt_type, audio_prompt_type_sources], outputs=[audio_prompt_type, audio_guide, audio_guide2, speakers_locations_row])
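To make the new label wiring concrete, a small sketch (hypothetical helper mirroring compute_video_length_label above) of the arithmetic now driven by the computed fps from force_fps / control video / source video rather than the model default:
def video_length_label(fps, current_video_length: int) -> str:
    if fps is None:
        return "Number of frames"
    return f"Number of frames ({fps} frames = 1s), current duration: {(current_video_length / fps):.1f}s"
print(video_length_label(16, 81))    # Number of frames (16 frames = 1s), current duration: 5.1s
print(video_length_label(None, 81))  # Number of frames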