fixes and polish

DeepBeepMeep 2025-09-03 19:39:17 +02:00
parent 0871a3be58
commit a60eea2371
7 changed files with 105 additions and 28 deletions

View File

@@ -26,12 +26,13 @@ class family_handler():
model_def_output["no_background_removal"] = True
model_def_output["image_ref_choices"] = {
"choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "I"),
("Up to two Images are Style Images", "IJ")],
"default": "I",
"letters_filter": "IJ",
"choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "KI"),
("Up to two Images are Style Images", "KIJ")],
"default": "KI",
"letters_filter": "KIJ",
"label": "Reference Images / Style Images"
}
model_def_output["lock_image_refs_ratios"] = True
return model_def_output
@@ -107,6 +108,16 @@ class family_handler():
pipe["feature_embedder"] = flux_model.feature_embedder
return flux_model, pipe
@staticmethod
def fix_settings(base_model_type, settings_version, model_def, ui_defaults):
flux_model = model_def.get("flux-model", "flux-dev")
flux_uso = flux_model == "flux-dev-uso"
if flux_uso and settings_version < 2.29:
video_prompt_type = ui_defaults.get("video_prompt_type", "")
if "I" in video_prompt_type:
video_prompt_type = video_prompt_type.replace("I", "KI")
ui_defaults["video_prompt_type"] = video_prompt_type
@staticmethod
def update_default_settings(base_model_type, model_def, ui_defaults):
flux_model = model_def.get("flux-model", "flux-dev")
@@ -116,6 +127,6 @@ class family_handler():
})
if model_def.get("reference_image", False):
ui_defaults.update({
"video_prompt_type": "I" if flux_uso else "KI",
"video_prompt_type": "KI",
})
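For reference, a minimal standalone sketch (hypothetical helper, not part of this commit) of the migration the new fix_settings performs: presets saved before settings_version 2.29 store the reference-image mode as "I"/"IJ", which the updated letters_filter now expects as "KI"/"KIJ".
def migrate_video_prompt_type(ui_defaults: dict, settings_version: float) -> dict:
    # replicate the replacement fix_settings applies to flux-dev-uso presets
    if settings_version < 2.29:
        video_prompt_type = ui_defaults.get("video_prompt_type", "")
        if "I" in video_prompt_type:
            ui_defaults["video_prompt_type"] = video_prompt_type.replace("I", "KI")
    return ui_defaults
# Example: an old preset saved as "IJ" becomes "KIJ"
print(migrate_video_prompt_type({"video_prompt_type": "IJ"}, 2.28))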

View File

@@ -9,6 +9,9 @@ from shared.utils.utils import calculate_new_dimensions
from .sampling import denoise, get_schedule, prepare_kontext, prepare_prompt, prepare_multi_ip, unpack
from .modules.layers import get_linear_split_map
from transformers import SiglipVisionModel, SiglipImageProcessor
import torchvision.transforms.functional as TVF
import math
from shared.utils.utils import convert_image_to_tensor, convert_tensor_to_image
from .util import (
aspect_ratio_to_height_width,
@@ -21,6 +24,44 @@ from .util import (
from PIL import Image
def resize_and_centercrop_image(image, target_height_ref1, target_width_ref1):
target_height_ref1 = int(target_height_ref1 // 64 * 64)
target_width_ref1 = int(target_width_ref1 // 64 * 64)
h, w = image.shape[-2:]
if h < target_height_ref1 or w < target_width_ref1:
# compute the aspect ratio
aspect_ratio = w / h
if h < target_height_ref1:
new_h = target_height_ref1
new_w = new_h * aspect_ratio
if new_w < target_width_ref1:
new_w = target_width_ref1
new_h = new_w / aspect_ratio
else:
new_w = target_width_ref1
new_h = new_w / aspect_ratio
if new_h < target_height_ref1:
new_h = target_height_ref1
new_w = new_h * aspect_ratio
else:
aspect_ratio = w / h
tgt_aspect_ratio = target_width_ref1 / target_height_ref1
if aspect_ratio > tgt_aspect_ratio:
new_h = target_height_ref1
new_w = new_h * aspect_ratio
else:
new_w = target_width_ref1
new_h = new_w / aspect_ratio
# resize the image with TVF.resize
image = TVF.resize(image, (math.ceil(new_h), math.ceil(new_w)))
# compute the center-crop offsets
top = (image.shape[-2] - target_height_ref1) // 2
left = (image.shape[-1] - target_width_ref1) // 2
# center-crop with TVF.crop
image = TVF.crop(image, top, left, target_height_ref1, target_width_ref1)
return image
def stitch_images(img1, img2):
# Resize img2 to match img1's height
width1, height1 = img1.size
@@ -129,11 +170,11 @@ class model_factory:
if n_prompt is None or len(n_prompt) == 0: n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
device="cuda"
flux_dev_uso = self.name in ['flux-dev-uso']
image_stiching = not self.name in ['flux-dev-uso']
image_stiching = not self.name in ['flux-dev-uso'] #and False
# image_refs_relative_size = 100
crop = False
input_ref_images = [] if input_ref_images is None else input_ref_images[:]
ref_style_imgs = []
if "I" in video_prompt_type and len(input_ref_images) > 0:
if flux_dev_uso :
if "J" in video_prompt_type:
@@ -148,7 +189,7 @@ class model_factory:
if "K" in video_prompt_type :
w, h = input_ref_images[0].size
height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
# actual rescale will happen in prepare_kontext
for new_img in input_ref_images[1:]:
stiched = stitch_images(stiched, new_img)
input_ref_images = [stiched]
@@ -157,14 +198,24 @@ class model_factory:
if "K" in video_prompt_type:
# image latents tiling method
w, h = input_ref_images[0].size
if crop :
img = convert_image_to_tensor(input_ref_images[0])
img = resize_and_centercrop_image(img, height, width)
input_ref_images[0] = convert_tensor_to_image(img)
else:
height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
input_ref_images[0] = input_ref_images[0].resize((width, height), resample=Image.Resampling.LANCZOS)
first_ref = 1
for i in range(first_ref,len(input_ref_images)):
w, h = input_ref_images[i].size
if crop:
img = convert_image_to_tensor(input_ref_images[i])
img = resize_and_centercrop_image(img, int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100))
input_ref_images[i] = convert_tensor_to_image(img)
else:
image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas)
input_ref_images[0] = input_ref_images[0].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
else:
input_ref_images = None
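For illustration, a minimal usage sketch of resize_and_centercrop_image from the hunk above (assumed environment: torch and torchvision installed, and the function in scope). Targets are snapped down to multiples of 64, the image is resized so both sides cover the target, then center-cropped to exactly the target size.
import torch
# 3 x 720 x 1280 stand-in for a reference image converted with convert_image_to_tensor
img = torch.rand(3, 720, 1280)
out = resize_and_centercrop_image(img, 1000, 1000)
# targets snap to 960 x 960; the image is upscaled to height 960, then center-cropped in width
print(out.shape)  # torch.Size([3, 960, 960])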

View File

@@ -153,7 +153,6 @@ def prepare_kontext(
# Kontext is trained on specific resolutions, using one of them is recommended
_, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS)
width = 2 * int(width / 16)
height = 2 * int(height / 16)

View File

@@ -15,6 +15,7 @@ class family_handler():
("Default", "default"),
("Lightning", "lightning")],
"guidance_max_phases" : 1,
"lock_image_refs_ratios": True,
}

View File

@@ -117,6 +117,8 @@ class family_handler():
extra_model_def["no_background_removal"] = True
# extra_model_def["at_least_one_image_ref_needed"] = True
if base_model_type in ["standin"] or vace_class:
extra_model_def["lock_image_refs_ratios"] = True
# if base_model_type in ["phantom_1.3B", "phantom_14B"]:
# extra_model_def["one_image_ref_needed"] = True

View File

@@ -18,11 +18,11 @@ import os
import tempfile
import subprocess
import json
from functools import lru_cache
from PIL import Image
video_info_cache = []
def seed_everything(seed: int):
random.seed(seed)
np.random.seed(seed)
@@ -77,7 +77,9 @@ def truncate_for_filesystem(s, max_bytes=255):
else: r = m - 1
return s[:l]
@lru_cache(maxsize=100)
def get_video_info(video_path):
global video_info_cache
import cv2
cap = cv2.VideoCapture(video_path)
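A minimal sketch (hypothetical names, not repository code) of the caching pattern this file moves to: functools.lru_cache memoizes results per video_path, which is what replaces the hand-rolled video_info_cache list.
from functools import lru_cache
@lru_cache(maxsize=100)
def probe_video(video_path: str) -> tuple:
    # stand-in for get_video_info; the real function opens the file with cv2.VideoCapture
    print(f"probing {video_path}")
    return (30.0, 150)  # e.g. (fps, frame_count)
probe_video("clip.mp4")  # runs the probe and caches the result
probe_video("clip.mp4")  # served from the cache, no second probe
Note that lru_cache keys on the call arguments, so they must be hashable (plain path strings are fine).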

wgp.py
View File

@@ -60,8 +60,8 @@ AUTOSAVE_FILENAME = "queue.zip"
PROMPT_VARS_MAX = 10
target_mmgp_version = "3.5.12"
WanGP_version = "8.32"
settings_version = 2.28
WanGP_version = "8.33"
settings_version = 2.29
max_source_video_frames = 3000
prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
@@ -3313,6 +3313,7 @@ def select_video(state, input_file_list, event_data: gr.EventData):
if not all_letters(src, pos): return False
if neg is not None and any_letters(src, neg): return False
return True
image_outputs = configs.get("image_mode",0) == 1
map_video_prompt = {"V" : "Control Video", ("VA", "U") : "Mask Video", "I" : "Reference Images"}
map_image_prompt = {"V" : "Source Video", "L" : "Last Video", "S" : "Start Image", "E" : "End Image"}
map_audio_prompt = {"A" : "Audio Source", "B" : "Audio Source #2"}
@@ -3364,6 +3365,7 @@ def select_video(state, input_file_list, event_data: gr.EventData):
if multiple_submodels:
video_guidance_scale += f" + Model Switch at {video_switch_threshold if video_model_switch_phase ==1 else video_switch_threshold2}"
video_flow_shift = configs.get("flow_shift", None)
if image_outputs: video_flow_shift = None
video_video_guide_outpainting = configs.get("video_guide_outpainting", "")
video_outpainting = ""
if len(video_video_guide_outpainting) > 0 and not video_video_guide_outpainting.startswith("#") \
@@ -4545,7 +4547,8 @@ def generate_video(
send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
from shared.utils.utils import resize_and_remove_background
image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= not (any_background_ref or vace or standin) ) # no fit for vace ref images as it is done later
# keep image ratios if there is a background image ref (we will let the model preprocessor decide what to do) but remove bg if requested
image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= not (any_background_ref or model_def.get("lock_image_refs_ratios", False)) ) # no fit for vace ref images as it is done later
update_task_thumbnails(task, locals())
send_cmd("output")
joint_pass = boost ==1 #and profile != 1 and profile != 3
@@ -5912,6 +5915,8 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None
if target == "settings":
return inputs
image_outputs = inputs.get("image_mode",0) == 1
pop=[]
if "force_fps" in inputs and len(inputs["force_fps"])== 0:
pop += ["force_fps"]
@@ -5977,7 +5982,7 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None
if guidance_max_phases < 3 or guidance_phases < 3:
pop += ["guidance3_scale", "switch_threshold2", "model_switch_phase"]
if ltxv:
if ltxv or image_outputs:
pop += ["flow_shift"]
if model_def.get("no_negative_prompt", False) :
@@ -6876,11 +6881,15 @@ def detect_auto_save_form(state, evt:gr.SelectData):
return gr.update()
def compute_video_length_label(fps, current_video_length):
if fps is None:
return f"Number of frames"
else:
return f"Number of frames ({fps} frames = 1s), current duration: {(current_video_length / fps):.1f}s",
def refresh_video_length_label(state, current_video_length):
fps = get_model_fps(get_base_model_type(state["model_type"]))
return gr.update(label= compute_video_length_label(fps, current_video_length))
def refresh_video_length_label(state, current_video_length, force_fps, video_guide, video_source):
base_model_type = get_base_model_type(state["model_type"])
computed_fps = get_computed_fps(force_fps, base_model_type , video_guide, video_source )
return gr.update(label= compute_video_length_label(computed_fps, current_video_length))
def generate_video_tab(update_form = False, state_dict = None, ui_defaults = None, model_family = None, model_choice = None, header = None, main = None, main_tabs= None):
global inputs_names #, advanced
@@ -7469,8 +7478,9 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
current_video_length = ui_defaults.get("video_length", 81 if get_model_family(base_model_type)=="wan" else 97)
computed_fps = get_computed_fps(ui_defaults.get("force_fps",""), base_model_type , video_guide, video_source )
video_length = gr.Slider(min_frames, get_max_frames(737 if test_any_sliding_window(base_model_type) else 337), value=current_video_length,
step=frames_step, label=compute_video_length_label(fps, current_video_length) , visible = True, interactive= True)
step=frames_step, label=compute_video_length_label(computed_fps, current_video_length) , visible = True, interactive= True)
with gr.Row(visible = not lock_inference_steps) as inference_steps_row:
num_inference_steps = gr.Slider(1, 100, value=ui_defaults.get("num_inference_steps",30), step=1, label="Number of Inference Steps", visible = True)
@@ -7643,7 +7653,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
with gr.Column(visible = (t2v or vace) and not fantasy) as audio_prompt_type_remux_row:
gr.Markdown("<B>You may transfer the exising audio tracks of a Control Video</B>")
gr.Markdown("<B>You may transfer the existing audio tracks of a Control Video</B>")
audio_prompt_type_remux = gr.Dropdown(
choices=[
("No Remux", ""),
@@ -7955,7 +7965,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
extra_inputs = prompt_vars + [wizard_prompt, wizard_variables_var, wizard_prompt_activated_var, video_prompt_column, image_prompt_column,
prompt_column_advanced, prompt_column_wizard_vars, prompt_column_wizard, lset_name, save_lset_prompt_drop, advanced_row, speed_tab, audio_tab, mmaudio_col, quality_tab,
sliding_window_tab, misc_tab, prompt_enhancer_row, inference_steps_row, skip_layer_guidance_row, audio_guide_row, RIFLEx_setting_col,
video_prompt_type_video_guide, video_prompt_type_video_guide_alt, video_prompt_type_video_mask, video_prompt_type_image_refs, apg_col, audio_prompt_type_sources, audio_prompt_type_remux_row,
video_prompt_type_video_guide, video_prompt_type_video_guide_alt, video_prompt_type_video_mask, video_prompt_type_image_refs, apg_col, audio_prompt_type_sources, audio_prompt_type_remux, audio_prompt_type_remux_row,
video_guide_outpainting_col,video_guide_outpainting_top, video_guide_outpainting_bottom, video_guide_outpainting_left, video_guide_outpainting_right,
video_guide_outpainting_checkbox, video_guide_outpainting_row, show_advanced, video_info_to_control_video_btn, video_info_to_video_source_btn, sample_solver_row,
video_buttons_row, image_buttons_row, video_postprocessing_tab, audio_remuxing_tab, PP_MMAudio_row, PP_custom_audio_row,
@@ -7975,7 +7985,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
resolution.change(fn=record_last_resolution, inputs=[state, resolution])
video_length.release(fn=refresh_video_length_label, inputs=[state, video_length ], outputs = video_length, trigger_mode="always_last" )
# video_length.release(fn=refresh_video_length_label, inputs=[state, video_length ], outputs = video_length, trigger_mode="always_last" )
gr.on(triggers=[video_length.release, force_fps.change, video_guide.change, video_source.change], fn=refresh_video_length_label, inputs=[state, video_length, force_fps, video_guide, video_source] , outputs = video_length, trigger_mode="always_last" )
guidance_phases.change(fn=change_guidance_phases, inputs= [state, guidance_phases], outputs =[model_switch_phase, guidance_phases_row, switch_threshold, switch_threshold2, guidance2_scale, guidance3_scale ])
audio_prompt_type_remux.change(fn=refresh_audio_prompt_type_remux, inputs=[state, audio_prompt_type, audio_prompt_type_remux], outputs=[audio_prompt_type])
audio_prompt_type_sources.change(fn=refresh_audio_prompt_type_sources, inputs=[state, audio_prompt_type, audio_prompt_type_sources], outputs=[audio_prompt_type, audio_guide, audio_guide2, speakers_locations_row])
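To make the new label wiring concrete, a small sketch (hypothetical helper mirroring compute_video_length_label above) of the arithmetic now driven by the computed fps from force_fps / control video / source video rather than the model default:
def video_length_label(fps, current_video_length: int) -> str:
    if fps is None:
        return "Number of frames"
    return f"Number of frames ({fps} frames = 1s), current duration: {(current_video_length / fps):.1f}s"
print(video_length_label(16, 81))    # Number of frames (16 frames = 1s), current duration: 5.1s
print(video_length_label(None, 81))  # Number of frames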