diff --git a/models/flux/flux_handler.py b/models/flux/flux_handler.py
index 162ec4c..b8b7b9b 100644
--- a/models/flux/flux_handler.py
+++ b/models/flux/flux_handler.py
@@ -26,12 +26,13 @@ class family_handler():
             model_def_output["no_background_removal"] = True
 
             model_def_output["image_ref_choices"] = {
-                "choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "I"),
-                ("Up to two Images are Style Images", "IJ")],
-                "default": "I",
-                "letters_filter": "IJ",
+                "choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "KI"),
+                ("Up to two Images are Style Images", "KIJ")],
+                "default": "KI",
+                "letters_filter": "KIJ",
                 "label": "Reference Images / Style Images"
             }
+            model_def_output["lock_image_refs_ratios"] = True
 
         return model_def_output
 
@@ -107,6 +108,16 @@ class family_handler():
             pipe["feature_embedder"] = flux_model.feature_embedder
         return flux_model, pipe
 
+    @staticmethod
+    def fix_settings(base_model_type, settings_version, model_def, ui_defaults):
+        flux_model = model_def.get("flux-model", "flux-dev")
+        flux_uso = flux_model == "flux-dev-uso"
+        if flux_uso and settings_version < 2.29:
+            video_prompt_type = ui_defaults.get("video_prompt_type", "")
+            if "I" in video_prompt_type:
+                video_prompt_type = video_prompt_type.replace("I", "KI")
+                ui_defaults["video_prompt_type"] = video_prompt_type
+
     @staticmethod
     def update_default_settings(base_model_type, model_def, ui_defaults):
         flux_model = model_def.get("flux-model", "flux-dev")
@@ -116,6 +127,6 @@ class family_handler():
             })
         if model_def.get("reference_image", False):
             ui_defaults.update({
-                "video_prompt_type": "I" if flux_uso else "KI",
+                "video_prompt_type": "KI",
             })
 
diff --git a/models/flux/flux_main.py b/models/flux/flux_main.py
index 55a2b91..9bb8e73 100644
--- a/models/flux/flux_main.py
+++ b/models/flux/flux_main.py
@@ -9,6 +9,9 @@ from shared.utils.utils import calculate_new_dimensions
 from .sampling import denoise, get_schedule, prepare_kontext, prepare_prompt, prepare_multi_ip, unpack
 from .modules.layers import get_linear_split_map
 from transformers import SiglipVisionModel, SiglipImageProcessor
+import torchvision.transforms.functional as TVF
+import math
+from shared.utils.utils import convert_image_to_tensor, convert_tensor_to_image
 
 from .util import (
     aspect_ratio_to_height_width,
@@ -21,6 +24,44 @@ from .util import (
 
 from PIL import Image
 
+def resize_and_centercrop_image(image, target_height_ref1, target_width_ref1):
+    target_height_ref1 = int(target_height_ref1 // 64 * 64)
+    target_width_ref1 = int(target_width_ref1 // 64 * 64)
+    h, w = image.shape[-2:]
+    if h < target_height_ref1 or w < target_width_ref1:
+        # compute the aspect ratio
+        aspect_ratio = w / h
+        if h < target_height_ref1:
+            new_h = target_height_ref1
+            new_w = new_h * aspect_ratio
+            if new_w < target_width_ref1:
+                new_w = target_width_ref1
+                new_h = new_w / aspect_ratio
+        else:
+            new_w = target_width_ref1
+            new_h = new_w / aspect_ratio
+            if new_h < target_height_ref1:
+                new_h = target_height_ref1
+                new_w = new_h * aspect_ratio
+    else:
+        aspect_ratio = w / h
+        tgt_aspect_ratio = target_width_ref1 / target_height_ref1
+        if aspect_ratio > tgt_aspect_ratio:
+            new_h = target_height_ref1
+            new_w = new_h * aspect_ratio
+        else:
+            new_w = target_width_ref1
+            new_h = new_w / aspect_ratio
+    # resize the image with TVF.resize
+    image = TVF.resize(image, (math.ceil(new_h), math.ceil(new_w)))
+    # compute the center-crop parameters
+    top = (image.shape[-2] - target_height_ref1) // 2
+    left = (image.shape[-1] - target_width_ref1) // 2
+    # center-crop with TVF.crop
+    image = TVF.crop(image, top, left, target_height_ref1, target_width_ref1)
+    return image
+
+
 def stitch_images(img1, img2):
     # Resize img2 to match img1's height
     width1, height1 = img1.size
@@ -129,11 +170,11 @@ class model_factory:
         if n_prompt is None or len(n_prompt) == 0:
             n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
         device="cuda"
         flux_dev_uso = self.name in ['flux-dev-uso']
-        image_stiching = not self.name in ['flux-dev-uso']
-
+        image_stiching = not self.name in ['flux-dev-uso'] #and False
+        # image_refs_relative_size = 100
+        crop = False
         input_ref_images = [] if input_ref_images is None else input_ref_images[:]
         ref_style_imgs = []
-
         if "I" in video_prompt_type and len(input_ref_images) > 0: 
             if flux_dev_uso :
                 if "J" in video_prompt_type:
@@ -148,7 +189,7 @@ class model_factory:
                 if "K" in video_prompt_type :
                     w, h = input_ref_images[0].size
                     height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
-
+                    # actual rescale will happen in prepare_kontext
                 for new_img in input_ref_images[1:]:
                     stiched = stitch_images(stiched, new_img)
                 input_ref_images = [stiched]
@@ -157,14 +198,24 @@ class model_factory:
                 if "K" in video_prompt_type:
                     # image latents tiling method
                     w, h = input_ref_images[0].size
-                    height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
-                    input_ref_images[0] = input_ref_images[0].resize((width, height), resample=Image.Resampling.LANCZOS)
+                    if crop :
+                        img = convert_image_to_tensor(input_ref_images[0])
+                        img = resize_and_centercrop_image(img, height, width)
+                        input_ref_images[0] = convert_tensor_to_image(img)
+                    else:
+                        height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
+                        input_ref_images[0] = input_ref_images[0].resize((width, height), resample=Image.Resampling.LANCZOS)
                     first_ref = 1
 
                 for i in range(first_ref,len(input_ref_images)):
                     w, h = input_ref_images[i].size
-                    image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas)
-                    input_ref_images[0] = input_ref_images[0].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
+                    if crop:
+                        img = convert_image_to_tensor(input_ref_images[i])
+                        img = resize_and_centercrop_image(img, int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100))
+                        input_ref_images[i] = convert_tensor_to_image(img)
+                    else:
+                        image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas)
+                        input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
         else:
             input_ref_images = None
 
diff --git a/models/flux/sampling.py b/models/flux/sampling.py
index 5534e9f..f43ae15 100644
--- a/models/flux/sampling.py
+++ b/models/flux/sampling.py
@@ -153,7 +153,6 @@ def prepare_kontext(
         # Kontext is trained on specific resolutions, using one of them is recommended
         _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS)
-
     width = 2 * int(width / 16)
     height = 2 * int(height / 16)
 
diff --git a/models/qwen/qwen_handler.py b/models/qwen/qwen_handler.py
index c6004e1..6fc488a 100644
--- a/models/qwen/qwen_handler.py
+++ b/models/qwen/qwen_handler.py
@@ -15,6 +15,7 @@ class family_handler():
                 ("Default", "default"),
                 ("Lightning", "lightning")],
"guidance_max_phases" : 1, + "lock_image_refs_ratios": True, } diff --git a/models/wan/wan_handler.py b/models/wan/wan_handler.py index 6d91fe2..9adc3a8 100644 --- a/models/wan/wan_handler.py +++ b/models/wan/wan_handler.py @@ -117,6 +117,8 @@ class family_handler(): extra_model_def["no_background_removal"] = True # extra_model_def["at_least_one_image_ref_needed"] = True + if base_model_type in ["standin"] or vace_class: + extra_model_def["lock_image_refs_ratios"] = True # if base_model_type in ["phantom_1.3B", "phantom_14B"]: # extra_model_def["one_image_ref_needed"] = True diff --git a/shared/utils/utils.py b/shared/utils/utils.py index a55807a..7ddf1eb 100644 --- a/shared/utils/utils.py +++ b/shared/utils/utils.py @@ -18,11 +18,11 @@ import os import tempfile import subprocess import json - +from functools import lru_cache from PIL import Image - +video_info_cache = [] def seed_everything(seed: int): random.seed(seed) np.random.seed(seed) @@ -77,7 +77,9 @@ def truncate_for_filesystem(s, max_bytes=255): else: r = m - 1 return s[:l] +@lru_cache(maxsize=100) def get_video_info(video_path): + global video_info_cache import cv2 cap = cv2.VideoCapture(video_path) diff --git a/wgp.py b/wgp.py index f11afa9..a3cc510 100644 --- a/wgp.py +++ b/wgp.py @@ -60,8 +60,8 @@ AUTOSAVE_FILENAME = "queue.zip" PROMPT_VARS_MAX = 10 target_mmgp_version = "3.5.12" -WanGP_version = "8.32" -settings_version = 2.28 +WanGP_version = "8.33" +settings_version = 2.29 max_source_video_frames = 3000 prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None @@ -3313,6 +3313,7 @@ def select_video(state, input_file_list, event_data: gr.EventData): if not all_letters(src, pos): return False if neg is not None and any_letters(src, neg): return False return True + image_outputs = configs.get("image_mode",0) == 1 map_video_prompt = {"V" : "Control Video", ("VA", "U") : "Mask Video", "I" : "Reference Images"} map_image_prompt = {"V" : "Source Video", "L" : "Last Video", "S" : "Start Image", "E" : "End Image"} map_audio_prompt = {"A" : "Audio Source", "B" : "Audio Source #2"} @@ -3364,6 +3365,7 @@ def select_video(state, input_file_list, event_data: gr.EventData): if multiple_submodels: video_guidance_scale += f" + Model Switch at {video_switch_threshold if video_model_switch_phase ==1 else video_switch_threshold2}" video_flow_shift = configs.get("flow_shift", None) + if image_outputs: video_flow_shift = None video_video_guide_outpainting = configs.get("video_guide_outpainting", "") video_outpainting = "" if len(video_video_guide_outpainting) > 0 and not video_video_guide_outpainting.startswith("#") \ @@ -4545,7 +4547,8 @@ def generate_video( send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")]) os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg") from shared.utils.utils import resize_and_remove_background - image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= not (any_background_ref or vace or standin) ) # no fit for vace ref images as it is done later + # keep image ratios if there is a background image ref (we will let the model preprocessor decide what to do) but remove bg if requested + image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, 
any_background_ref, fit_into_canvas= not (any_background_ref or model_def.get("lock_image_refs_ratios", False)) ) # no fit for vace ref images as it is done later update_task_thumbnails(task, locals()) send_cmd("output") joint_pass = boost ==1 #and profile != 1 and profile != 3 @@ -5912,6 +5915,8 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None if target == "settings": return inputs + image_outputs = inputs.get("image_mode",0) == 1 + pop=[] if "force_fps" in inputs and len(inputs["force_fps"])== 0: pop += ["force_fps"] @@ -5977,7 +5982,7 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None if guidance_max_phases < 3 or guidance_phases < 3: pop += ["guidance3_scale", "switch_threshold2", "model_switch_phase"] - if ltxv: + if ltxv or image_outputs: pop += ["flow_shift"] if model_def.get("no_negative_prompt", False) : @@ -6876,11 +6881,15 @@ def detect_auto_save_form(state, evt:gr.SelectData): return gr.update() def compute_video_length_label(fps, current_video_length): - return f"Number of frames ({fps} frames = 1s), current duration: {(current_video_length / fps):.1f}s", + if fps is None: + return f"Number of frames" + else: + return f"Number of frames ({fps} frames = 1s), current duration: {(current_video_length / fps):.1f}s", -def refresh_video_length_label(state, current_video_length): - fps = get_model_fps(get_base_model_type(state["model_type"])) - return gr.update(label= compute_video_length_label(fps, current_video_length)) +def refresh_video_length_label(state, current_video_length, force_fps, video_guide, video_source): + base_model_type = get_base_model_type(state["model_type"]) + computed_fps = get_computed_fps(force_fps, base_model_type , video_guide, video_source ) + return gr.update(label= compute_video_length_label(computed_fps, current_video_length)) def generate_video_tab(update_form = False, state_dict = None, ui_defaults = None, model_family = None, model_choice = None, header = None, main = None, main_tabs= None): global inputs_names #, advanced @@ -7469,8 +7478,9 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non current_video_length = ui_defaults.get("video_length", 81 if get_model_family(base_model_type)=="wan" else 97) + computed_fps = get_computed_fps(ui_defaults.get("force_fps",""), base_model_type , video_guide, video_source ) video_length = gr.Slider(min_frames, get_max_frames(737 if test_any_sliding_window(base_model_type) else 337), value=current_video_length, - step=frames_step, label=compute_video_length_label(fps, current_video_length) , visible = True, interactive= True) + step=frames_step, label=compute_video_length_label(computed_fps, current_video_length) , visible = True, interactive= True) with gr.Row(visible = not lock_inference_steps) as inference_steps_row: num_inference_steps = gr.Slider(1, 100, value=ui_defaults.get("num_inference_steps",30), step=1, label="Number of Inference Steps", visible = True) @@ -7643,7 +7653,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non with gr.Column(visible = (t2v or vace) and not fantasy) as audio_prompt_type_remux_row: - gr.Markdown("You may transfer the exising audio tracks of a Control Video") + gr.Markdown("You may transfer the existing audio tracks of a Control Video") audio_prompt_type_remux = gr.Dropdown( choices=[ ("No Remux", ""), @@ -7955,7 +7965,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non extra_inputs = prompt_vars + 
[wizard_prompt, wizard_variables_var, wizard_prompt_activated_var, video_prompt_column, image_prompt_column, prompt_column_advanced, prompt_column_wizard_vars, prompt_column_wizard, lset_name, save_lset_prompt_drop, advanced_row, speed_tab, audio_tab, mmaudio_col, quality_tab, sliding_window_tab, misc_tab, prompt_enhancer_row, inference_steps_row, skip_layer_guidance_row, audio_guide_row, RIFLEx_setting_col, - video_prompt_type_video_guide, video_prompt_type_video_guide_alt, video_prompt_type_video_mask, video_prompt_type_image_refs, apg_col, audio_prompt_type_sources, audio_prompt_type_remux_row, + video_prompt_type_video_guide, video_prompt_type_video_guide_alt, video_prompt_type_video_mask, video_prompt_type_image_refs, apg_col, audio_prompt_type_sources, audio_prompt_type_remux, audio_prompt_type_remux_row, video_guide_outpainting_col,video_guide_outpainting_top, video_guide_outpainting_bottom, video_guide_outpainting_left, video_guide_outpainting_right, video_guide_outpainting_checkbox, video_guide_outpainting_row, show_advanced, video_info_to_control_video_btn, video_info_to_video_source_btn, sample_solver_row, video_buttons_row, image_buttons_row, video_postprocessing_tab, audio_remuxing_tab, PP_MMAudio_row, PP_custom_audio_row, @@ -7975,7 +7985,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non resolution.change(fn=record_last_resolution, inputs=[state, resolution]) - video_length.release(fn=refresh_video_length_label, inputs=[state, video_length ], outputs = video_length, trigger_mode="always_last" ) + # video_length.release(fn=refresh_video_length_label, inputs=[state, video_length ], outputs = video_length, trigger_mode="always_last" ) + gr.on(triggers=[video_length.release, force_fps.change, video_guide.change, video_source.change], fn=refresh_video_length_label, inputs=[state, video_length, force_fps, video_guide, video_source] , outputs = video_length, trigger_mode="always_last" ) guidance_phases.change(fn=change_guidance_phases, inputs= [state, guidance_phases], outputs =[model_switch_phase, guidance_phases_row, switch_threshold, switch_threshold2, guidance2_scale, guidance3_scale ]) audio_prompt_type_remux.change(fn=refresh_audio_prompt_type_remux, inputs=[state, audio_prompt_type, audio_prompt_type_remux], outputs=[audio_prompt_type]) audio_prompt_type_sources.change(fn=refresh_audio_prompt_type_sources, inputs=[state, audio_prompt_type, audio_prompt_type_sources], outputs=[audio_prompt_type, audio_guide, audio_guide2, speakers_locations_row])
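
A minimal sketch to sanity-check the new resize_and_centercrop_image helper outside the generation pipeline. It assumes a WanGP checkout with torch and torchvision installed; the tensor sizes below are arbitrary examples chosen for illustration, not values used by this patch.

    # Sketch only: exercises the cover-resize + center-crop behaviour of the new helper.
    # Target dimensions are snapped down to multiples of 64 before resizing and cropping.
    import torch
    from models.flux.flux_main import resize_and_centercrop_image

    ref = torch.rand(3, 500, 900)                      # C x H x W tensor, smaller than the snapped targets
    out = resize_and_centercrop_image(ref, 720, 1280)  # 720 -> 704, 1280 -> 1280
    print(out.shape)                                   # torch.Size([3, 704, 1280])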