Take me to outer space

This commit is contained in:
DeepBeepMeep 2025-09-05 14:17:21 +02:00
parent 13b001d4ea
commit 8859e816d0
7 changed files with 254 additions and 203 deletions

View File

@ -20,6 +20,23 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models
**Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
## 🔥 Latest Updates :
### September 5 2025: WanGP v8.4 - Take me to Outer Space
You have probably seen those short AI-generated movies created with *Nano Banana* and the *First Frame - Last Frame* feature of *Kling 2.0*. The idea is to generate an image, modify a part of it with Nano Banana, and give these two images to Kling, which generates the video between them; then use the previous Last Frame as the new First Frame, rinse and repeat, and you get a full movie.
I have made it easier to do just that with *Qwen Edit* and *Wan*:
- **End Frames can now be combined with Continue a Video** (and not just with a Start Frame)
- **Multiple End Frames can now be provided**; each End Frame will be used for a different Sliding Window
You can plan all your shots in advance (one shot = one Sliding Window): I recommend using Wan 2.2 Image to Image with multiple End Frames (one for each shot / Sliding Window) and a different Text Prompt for each shot / Sliding Window (remember to enable *Sliding Windows/Text Prompts Will be used for a new Sliding Window of the same Video Generation*).
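To make the mapping concrete, here is a minimal sketch in plain Python (this is not WanGP's API; the file names and prompts are invented) of how each line of the Text Prompt and each End Frame are consumed by successive Sliding Windows, mirroring the per-window selection added to wgp.py further down in this commit:

```python
# Hypothetical shot plan: one prompt line + one End Frame per Sliding Window.
shot_prompts = [
    "Shot 1: the rocket sits on the launch pad at dawn",
    "Shot 2: the rocket punches through the clouds",
    "Shot 3: the rocket drifts silently in outer space",
]
end_frames = ["shot1_end.png", "shot2_end.png", "shot3_end.png"]  # one per shot

for window_no in range(1, len(shot_prompts) + 1):  # windows are numbered from 1, as in wgp.py
    # Extra windows beyond the planned shots would simply reuse the last prompt line.
    prompt = shot_prompts[min(window_no, len(shot_prompts)) - 1]
    # Each End Frame is used for a different window, in order; later windows would get none.
    end_frame = end_frames[window_no - 1] if window_no <= len(end_frames) else None
    print(f"window {window_no}: prompt={prompt!r}, end frame={end_frame}")
```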
The results can be quite impressive. However, Wan 2.1 & 2.2 Image to Image are restricted to a single overlap frame when using Sliding Windows, which means only one frame is reused to carry the motion. This may be insufficient if you are trying to connect two shots with fast movement.
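As a rough illustration of why the overlap size matters (the arithmetic below is an assumption for illustration, not necessarily the exact compute_sliding_window_no formula used in wgp.py):

```python
import math

def windows_needed(total_frames: int, window_size: int, overlap: int) -> int:
    # After the first window, each additional window only contributes
    # (window_size - overlap) new frames; with overlap=1 a single frame
    # carries the motion from one shot to the next.
    if total_frames <= window_size:
        return 1
    return 1 + math.ceil((total_frames - window_size) / (window_size - overlap))

print(windows_needed(161, 81, overlap=1))  # 2 windows, only one shared frame between shots
print(windows_needed(161, 81, overlap=9))  # 3 windows: a larger overlap needs more windows
```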
This is where *InfiniteTalk* comes into play. Besides being one of the best models for generating animated audio-driven avatars, InfiniteTalk internally uses more than one motion frame, so it is quite good at maintaining motion between two shots. I have tweaked InfiniteTalk so that **its motion engine can be used even if no audio is provided**.
So here is how to use InfiniteTalk: enable *Sliding Windows/Text Prompts Will be used for a new Sliding Window of the same Video Generation*, and if you continue an existing Video, set *Misc/Override Frames per Second* to *Source Video*. Each Reference Frame provided will play the same role as an End Frame, except it won't be exactly an End Frame (it corresponds more to a middle frame; the actual End Frame will differ, but it will be close).
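Under the hood the tweak is simple, and you can see it in the WanAny2V diff below: when no audio is provided, the audio projection is replaced by zero tensors so the InfiniteTalk/Multitalk motion path still runs. A condensed, standalone sketch of that idea (tensor shapes copied from the diff; the surrounding class is not reproduced):

```python
import torch

def zero_audio_proj(frame_num: int, dtype=torch.float16, device="cpu"):
    # Zeroed audio embeddings with the same shapes as in the WanAny2V change:
    # they keep the motion engine running even though no audio drives the avatar.
    return [
        torch.zeros((1, 1, 5, 12, 768), dtype=dtype, device=device),
        torch.zeros((1, (frame_num - 1) // 4, 8, 12, 768), dtype=dtype, device=device),
    ]

audio_proj = zero_audio_proj(frame_num=81)
print([tuple(t.shape) for t in audio_proj])  # [(1, 1, 5, 12, 768), (1, 20, 8, 12, 768)]
```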
You will find below a 33s movie I created using these two methods. The quality could be much better as I haven't tuned the settings at all (I couldn't be bothered; I used 10-step generations without Loras Accelerators for most of the gens).
### September 2 2025: WanGP v8.31 - At last the pain stops
- This single new feature should give you the strength to face all the potential bugs of this new release:

View File

@ -56,6 +56,10 @@ class family_handler():
if base_model_type in ["hunyuan_custom", "hunyuan_custom_edit", "hunyuan_custom_audio", "hunyuan_avatar"]:
extra_model_def["one_image_ref_needed"] = True
if base_model_type in ["hunyuan_i2v"]:
extra_model_def["image_prompt_types_allowed"] = "S"
return extra_model_def
@staticmethod

View File

@ -24,6 +24,7 @@ class family_handler():
extra_model_def["frames_minimum"] = 17
extra_model_def["frames_steps"] = 8
extra_model_def["sliding_window"] = True
extra_model_def["image_prompt_types_allowed"] = "TSEV"
return extra_model_def

View File

@ -537,7 +537,6 @@ class WanAny2V:
image_ref = input_frames[:, 0]
if input_video is None: input_video = input_frames[:, 0:1]
new_shot = "Q" in video_prompt_type
denoising_strength = 0.5
else:
if pre_video_frame is None:
new_shot = True
@ -556,54 +555,39 @@ class WanAny2V:
_ , preframes_count, height, width = input_video.shape
input_video = input_video.to(device=self.device).to(dtype= self.VAE_dtype)
if infinitetalk:
image_for_clip = image_ref.to(input_video)
image_start = image_ref.to(input_video)
control_pre_frames_count = 1
control_video = image_for_clip.unsqueeze(1)
control_video = image_start.unsqueeze(1)
else:
image_for_clip = input_video[:, -1]
image_start = input_video[:, -1]
control_pre_frames_count = preframes_count
control_video = input_video
lat_h, lat_w = height // self.vae_stride[1], width // self.vae_stride[2]
if hasattr(self, "clip"):
clip_image_size = self.clip.model.image_size
clip_image = resize_lanczos(image_for_clip, clip_image_size, clip_image_size)[:, None, :, :]
clip_context = self.clip.visual([clip_image]) if model_type != "flf2v_720p" else self.clip.visual([clip_image , clip_image ])
clip_image = None
else:
clip_context = None
enc = torch.concat( [control_video, torch.zeros( (3, frame_num-control_pre_frames_count, height, width),
device=self.device, dtype= self.VAE_dtype)],
dim = 1).to(self.device)
color_reference_frame = image_for_clip.unsqueeze(1).clone()
color_reference_frame = image_start.unsqueeze(1).clone()
else:
preframes_count = control_pre_frames_count = 1
height, width = image_start.shape[1:]
control_video = image_start.unsqueeze(1).to(self.device)
color_reference_frame = control_video.clone()
any_end_frame = image_end is not None
add_frames_for_end_image = any_end_frame and model_type == "i2v"
if any_end_frame:
color_correction_strength = 0 # disable color correction as transition frames between shots may have a completely different color level than the colors of the new shot
if add_frames_for_end_image:
frame_num +=1
lat_frames = int((frame_num - 2) // self.vae_stride[0] + 2)
trim_frames = 1
height, width = image_start.shape[1:]
lat_h, lat_w = height // self.vae_stride[1], width // self.vae_stride[2]
lat_h = round(
height // self.vae_stride[1] //
self.patch_size[1] * self.patch_size[1])
lat_w = round(
width // self.vae_stride[2] //
self.patch_size[2] * self.patch_size[2])
height = lat_h * self.vae_stride[1]
width = lat_w * self.vae_stride[2]
image_start_frame = image_start.unsqueeze(1).to(self.device)
color_reference_frame = image_start_frame.clone()
if image_end is not None:
img_end_frame = image_end.unsqueeze(1).to(self.device)
if hasattr(self, "clip"):
clip_image_size = self.clip.model.image_size
image_start = resize_lanczos(image_start, clip_image_size, clip_image_size)
if image_end is not None: image_end = resize_lanczos(image_end, clip_image_size, clip_image_size)
image_end = resize_lanczos(image_end, clip_image_size, clip_image_size) if image_end is not None else image_start
if model_type == "flf2v_720p":
clip_context = self.clip.visual([image_start[:, None, :, :], image_end[:, None, :, :] if image_end is not None else image_start[:, None, :, :]])
else:
@ -613,17 +597,17 @@ class WanAny2V:
if any_end_frame:
enc= torch.concat([
image_start_frame,
torch.zeros( (3, frame_num-2, height, width), device=self.device, dtype= self.VAE_dtype),
control_video,
torch.zeros( (3, frame_num-control_pre_frames_count-1, height, width), device=self.device, dtype= self.VAE_dtype),
img_end_frame,
], dim=1).to(self.device)
else:
enc= torch.concat([
image_start_frame,
torch.zeros( (3, frame_num-1, height, width), device=self.device, dtype= self.VAE_dtype)
control_video,
torch.zeros( (3, frame_num-control_pre_frames_count, height, width), device=self.device, dtype= self.VAE_dtype)
], dim=1).to(self.device)
image_start = image_end = image_start_frame = img_end_frame = image_for_clip = image_ref = None
image_start = image_end = img_end_frame = image_ref = control_video = None
msk = torch.ones(1, frame_num, lat_h, lat_w, device=self.device)
if any_end_frame:
@ -657,12 +641,11 @@ class WanAny2V:
# Recam Master
if recam:
# should in fact be in input_frames since it is a control video, not a video to be extended
target_camera = model_mode
height,width = input_video.shape[-2:]
input_video = input_video.to(dtype=self.dtype , device=self.device)
source_latents = self.vae.encode([input_video])[0].unsqueeze(0) #.to(dtype=self.dtype, device=self.device)
del input_video
height,width = input_frames.shape[-2:]
input_frames = input_frames.to(dtype=self.dtype , device=self.device)
source_latents = self.vae.encode([input_frames])[0].unsqueeze(0) #.to(dtype=self.dtype, device=self.device)
del input_frames
# Process target camera (recammaster)
from shared.utils.cammmaster_tools import get_camera_embedding
cam_emb = get_camera_embedding(target_camera)
@ -754,7 +737,9 @@ class WanAny2V:
else:
target_shape = (self.vae.model.z_dim, lat_frames + ref_images_count, height // self.vae_stride[1], width // self.vae_stride[2])
if multitalk and audio_proj != None:
if multitalk:
if audio_proj is None:
audio_proj = [ torch.zeros( (1, 1, 5, 12, 768 ), dtype=self.dtype, device=self.device), torch.zeros( (1, (frame_num - 1) // 4, 8, 12, 768 ), dtype=self.dtype, device=self.device) ]
from .multitalk.multitalk import get_target_masks
audio_proj = [audio.to(self.dtype) for audio in audio_proj]
human_no = len(audio_proj[0])

View File

@ -26,6 +26,18 @@ class family_handler():
extra_model_def["tea_cache"] = True
extra_model_def["guidance_max_phases"] = 1
extra_model_def["model_modes"] = {
"choices": [
("Synchronous", 0),
("Asynchronous (better quality but around 50% extra steps added)", 5),
],
"default": 0,
"label" : "Generation Type"
}
extra_model_def["image_prompt_types_allowed"] = "TSEV"
return extra_model_def
@staticmethod

View File

@ -5,6 +5,9 @@ import gradio as gr
def test_class_i2v(base_model_type):
return base_model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk", "infinitetalk", "i2v_2_2_multitalk" ]
def text_oneframe_overlap(base_model_type):
return test_class_i2v(base_model_type) and not test_multitalk(base_model_type)
def test_class_1_3B(base_model_type):
return base_model_type in [ "vace_1.3B", "t2v_1.3B", "recam_1.3B","phantom_1.3B","fun_inp_1.3B"]
@ -120,6 +123,37 @@ class family_handler():
if base_model_type in ["standin"] or vace_class:
extra_model_def["lock_image_refs_ratios"] = True
if base_model_type in ["recam_1.3B"]:
extra_model_def["keep_frames_video_guide_not_supported"] = True
extra_model_def["model_modes"] = {
"choices": [
("Pan Right", 1),
("Pan Left", 2),
("Tilt Up", 3),
("Tilt Down", 4),
("Zoom In", 5),
("Zoom Out", 6),
("Translate Up (with rotation)", 7),
("Translate Down (with rotation)", 8),
("Arc Left (with rotation)", 9),
("Arc Right (with rotation)", 10),
],
"default": 1,
"label" : "Camera Movement Type"
}
if vace_class or base_model_type in ["infinitetalk"]:
image_prompt_types_allowed = "TVL"
elif base_model_type in ["ti2v_2_2"]:
image_prompt_types_allowed = "TSEVL"
elif i2v:
image_prompt_types_allowed = "SEVL"
else:
image_prompt_types_allowed = ""
extra_model_def["image_prompt_types_allowed"] = image_prompt_types_allowed
if text_oneframe_overlap(base_model_type):
extra_model_def["sliding_window_defaults"] = { "overlap_min" : 1, "overlap_max" : 1, "overlap_step": 0, "overlap_default": 1}
# if base_model_type in ["phantom_1.3B", "phantom_14B"]:
# extra_model_def["one_image_ref_needed"] = True
@ -251,6 +285,17 @@ class family_handler():
video_prompt_type = video_prompt_type.replace("U", "RU")
ui_defaults["video_prompt_type"] = video_prompt_type
if settings_version < 2.31:
if base_model_type in "recam_1.3B":
video_prompt_type = ui_defaults.get("video_prompt_type", "")
if not "V" in video_prompt_type:
video_prompt_type += "UV"
ui_defaults["video_prompt_type"] = video_prompt_type
ui_defaults["image_prompt_type"] = ""
if text_oneframe_overlap(base_model_type):
ui_defaults["sliding_window_overlap"] = 1
@staticmethod
def update_default_settings(base_model_type, model_def, ui_defaults):
ui_defaults.update({
@ -309,6 +354,15 @@ class family_handler():
"image_prompt_type": "T",
})
if base_model_type in ["recam_1.3B"]:
ui_defaults.update({
"video_prompt_type": "UV",
})
if text_oneframe_overlap(base_model_type):
ui_defaults.update["sliding_window_overlap"] = 1
ui_defaults.update["color_correction_strength"]= 0
if test_multitalk(base_model_type):
ui_defaults["audio_guidance_scale"] = 4

wgp.py
View File

@ -60,8 +60,8 @@ AUTOSAVE_FILENAME = "queue.zip"
PROMPT_VARS_MAX = 10
target_mmgp_version = "3.6.0"
WanGP_version = "8.34"
settings_version = 2.29
WanGP_version = "8.4"
settings_version = 2.31
max_source_video_frames = 3000
prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
@ -347,7 +347,7 @@ def process_prompt_and_add_tasks(state, model_choice):
model_switch_phase = inputs["model_switch_phase"]
switch_threshold = inputs["switch_threshold"]
switch_threshold2 = inputs["switch_threshold2"]
multi_prompts_gen_type = inputs["multi_prompts_gen_type"]
if len(loras_multipliers) > 0:
_, _, errors = parse_loras_multipliers(loras_multipliers, len(activated_loras), num_inference_steps, nb_phases= guidance_phases)
@ -445,7 +445,7 @@ def process_prompt_and_add_tasks(state, model_choice):
if "I" in video_prompt_type:
if image_refs == None or len(image_refs) == 0:
gr.Info("You must provide at least one Refererence Image")
gr.Info("You must provide at least one Reference Image")
return
image_refs = clean_image_list(image_refs)
if image_refs == None :
@ -511,9 +511,14 @@ def process_prompt_and_add_tasks(state, model_choice):
if image_start == None :
gr.Info("Start Image should be an Image")
return
if multi_prompts_gen_type == 1 and len(image_start) > 1:
gr.Info("Only one Start Image is supported")
return
else:
image_start = None
if not any_letters(image_prompt_type, "SVL"):
image_prompt_type = image_prompt_type.replace("E", "")
if "E" in image_prompt_type:
if image_end == None or isinstance(image_end, list) and len(image_end) == 0:
gr.Info("You must provide an End Image")
@ -522,8 +527,13 @@ def process_prompt_and_add_tasks(state, model_choice):
if image_end == None :
gr.Info("End Image should be an Image")
return
if len(image_start) != len(image_end):
gr.Info("The number of Start and End Images should be the same ")
if multi_prompts_gen_type == 0:
if video_source is not None:
if len(image_end)> 1:
gr.Info("If a Video is to be continued and the option 'Each Text Prompt Will create a new generated Video' is set, there can be only one End Image")
return
elif len(image_start or []) != len(image_end or []):
gr.Info("The number of Start and End Images should be the same when the option 'Each Text Prompt Will create a new generated Video'")
return
else:
image_end = None
@ -531,23 +541,21 @@ def process_prompt_and_add_tasks(state, model_choice):
if test_any_sliding_window(model_type) and image_mode == 0:
if video_length > sliding_window_size:
full_video_length = video_length if video_source is None else video_length + sliding_window_overlap
full_video_length = video_length if video_source is None else video_length + sliding_window_overlap -1
extra = "" if full_video_length == video_length else f" including {sliding_window_overlap} added for Video Continuation"
no_windows = compute_sliding_window_no(full_video_length, sliding_window_size, sliding_window_discard_last_frames, sliding_window_overlap)
gr.Info(f"The Number of Frames to generate ({video_length}{extra}) is greater than the Sliding Window Size ({sliding_window_size}), {no_windows} Windows will be generated")
if "recam" in model_filename:
if video_source == None:
gr.Info("You must provide a Source Video")
if video_guide == None:
gr.Info("You must provide a Control Video")
return
frames = get_resampled_video(video_source, 0, 81, get_computed_fps(force_fps, model_type , video_guide, video_source ))
computed_fps = get_computed_fps(force_fps, model_type , video_guide, video_source )
frames = get_resampled_video(video_guide, 0, 81, computed_fps)
if len(frames)<81:
gr.Info("Recammaster source video should be at least 81 frames once the resampling at 16 fps has been done")
gr.Info(f"Recammaster Control video should be at least 81 frames once the resampling at {computed_fps} fps has been done")
return
if "hunyuan_custom_custom_edit" in model_filename:
if len(keep_frames_video_guide) > 0:
gr.Info("Filtering Frames with this model is not supported")
@ -558,13 +566,13 @@ def process_prompt_and_add_tasks(state, model_choice):
gr.Info("Only one Start Image must be provided if multiple prompts are used for different windows")
return
if image_end != None and len(image_end) > 1:
gr.Info("Only one End Image must be provided if multiple prompts are used for different windows")
return
# if image_end != None and len(image_end) > 1:
# gr.Info("Only one End Image must be provided if multiple prompts are used for different windows")
# return
override_inputs = {
"image_start": image_start[0] if image_start !=None and len(image_start) > 0 else None,
"image_end": image_end[0] if image_end !=None and len(image_end) > 0 else None,
"image_end": image_end, #[0] if image_end !=None and len(image_end) > 0 else None,
"image_refs": image_refs,
"audio_guide": audio_guide,
"audio_guide2": audio_guide2,
@ -640,19 +648,21 @@ def process_prompt_and_add_tasks(state, model_choice):
override_inputs["prompt"] = single_prompt
inputs.update(override_inputs)
add_video_task(**inputs)
new_prompts_count = len(prompts)
else:
new_prompts_count = 1
override_inputs["prompt"] = "\n".join(prompts)
inputs.update(override_inputs)
add_video_task(**inputs)
gen["prompts_max"] = len(prompts) + gen.get("prompts_max",0)
gen["prompts_max"] = new_prompts_count + gen.get("prompts_max",0)
state["validate_success"] = 1
queue= gen.get("queue", [])
return update_queue_data(queue)
def get_preview_images(inputs):
inputs_to_query = ["image_start", "image_end", "video_source", "video_guide", "image_guide", "video_mask", "image_mask", "image_refs" ]
labels = ["Start Image", "End Image", "Video Source", "Video Guide", "Image Guide", "Video Mask", "Image Mask", "Image Reference"]
inputs_to_query = ["image_start", "video_source", "image_end", "video_guide", "image_guide", "video_mask", "image_mask", "image_refs" ]
labels = ["Start Image", "Video Source", "End Image", "Video Guide", "Image Guide", "Video Mask", "Image Mask", "Image Reference"]
start_image_data = None
start_image_labels = []
end_image_data = None
@ -3454,6 +3464,8 @@ def convert_image(image):
from PIL import ImageOps
from typing import cast
if isinstance(image, str):
image = Image.open(image)
image = image.convert('RGB')
return cast(Image, ImageOps.exif_transpose(image))
@ -4506,7 +4518,7 @@ def generate_video(
if test_any_sliding_window(model_type) :
if video_source is not None:
current_video_length += sliding_window_overlap
current_video_length += sliding_window_overlap - 1
sliding_window = current_video_length > sliding_window_size
reuse_frames = min(sliding_window_size - 4, sliding_window_overlap)
else:
@ -4690,7 +4702,6 @@ def generate_video(
while not abort:
enable_RIFLEx = RIFLEx_setting == 0 and current_video_length > (6* get_model_fps(base_model_type)+1) or RIFLEx_setting == 1
if sliding_window:
prompt = prompts[window_no] if window_no < len(prompts) else prompts[-1]
new_extra_windows = gen.get("extra_windows",0)
gen["extra_windows"] = 0
@ -4722,15 +4733,13 @@ def generate_video(
image_start_tensor = image_start.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)
image_start_tensor = convert_image_to_tensor(image_start_tensor)
pre_video_guide = prefix_video = image_start_tensor.unsqueeze(1)
if image_end is not None:
image_end_tensor = image_end.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)
image_end_tensor = convert_image_to_tensor(image_end_tensor)
else:
if "L" in image_prompt_type:
refresh_preview["video_source"] = get_video_frame(video_source, 0)
prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, target_fps = fps, block_size = block_size )
prefix_video = prefix_video.permute(3, 0, 1, 2)
prefix_video = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w
new_height, new_width = prefix_video.shape[-2:]
pre_video_guide = prefix_video[:, -reuse_frames:]
pre_video_frame = convert_tensor_to_image(prefix_video[:, -1])
source_video_overlap_frames_count = pre_video_guide.shape[1]
@ -4739,6 +4748,13 @@ def generate_video(
image_size = pre_video_guide.shape[-2:]
sample_fit_canvas = None
guide_start_frame = prefix_video.shape[1]
if image_end is not None:
image_end_list= image_end if isinstance(image_end, list) else [image_end]
if len(image_end_list) >= window_no:
new_height, new_width = image_size
image_end_tensor =image_end_list[window_no-1].resize((new_width, new_height), resample=Image.Resampling.LANCZOS)
image_end_tensor = convert_image_to_tensor(image_end_tensor)
image_end_list= None
window_start_frame = guide_start_frame - (reuse_frames if window_no > 1 else source_video_overlap_frames_count)
guide_end_frame = guide_start_frame + current_video_length - (source_video_overlap_frames_count if window_no == 1 else reuse_frames)
@ -4797,7 +4813,7 @@ def generate_video(
image_size = src_video.shape[-2:]
sample_fit_canvas = None
elif "G" in video_prompt_type: # video to video
else: # video to video
video_guide_processed = preprocess_video(width = image_size[1], height=image_size[0], video_in=video_guide, max_frames= len(keep_frames_parsed), start_frame = aligned_guide_start_frame, fit_canvas= sample_fit_canvas, target_fps = fps)
if video_guide_processed is None:
src_video = pre_video_guide
@ -5298,7 +5314,7 @@ def process_tasks(state):
while True:
with gen_lock:
process_status = gen.get("process_status", None)
if process_status is None:
if process_status is None or process_status == "process:main":
gen["process_status"] = "process:main"
break
time.sleep(1)
@ -6570,11 +6586,13 @@ def any_letters(source_str, letters):
return True
return False
def filter_letters(source_str, letters):
def filter_letters(source_str, letters, default= ""):
ret = ""
for letter in letters:
if letter in source_str:
ret += letter
if len(ret) == 0:
return default
return ret
def add_to_sequence(source_str, letters):
@ -6601,9 +6619,18 @@ def refresh_audio_prompt_type_sources(state, audio_prompt_type, audio_prompt_typ
audio_prompt_type = add_to_sequence(audio_prompt_type, audio_prompt_type_sources)
return audio_prompt_type, gr.update(visible = "A" in audio_prompt_type), gr.update(visible = "B" in audio_prompt_type), gr.update(visible = ("B" in audio_prompt_type or "X" in audio_prompt_type))
def refresh_image_prompt_type(state, image_prompt_type):
any_video_source = len(filter_letters(image_prompt_type, "VLG"))>0
return gr.update(visible = "S" in image_prompt_type ), gr.update(visible = "E" in image_prompt_type ), gr.update(visible = "V" in image_prompt_type) , gr.update(visible = any_video_source)
def refresh_image_prompt_type_radio(state, image_prompt_type, image_prompt_type_radio):
image_prompt_type = del_in_sequence(image_prompt_type, "VLTS")
image_prompt_type = add_to_sequence(image_prompt_type, image_prompt_type_radio)
any_video_source = len(filter_letters(image_prompt_type, "VL"))>0
end_visible = any_letters(image_prompt_type, "SVL")
return image_prompt_type, gr.update(visible = "S" in image_prompt_type ), gr.update(visible = end_visible and ("E" in image_prompt_type) ), gr.update(visible = "V" in image_prompt_type) , gr.update(visible = any_video_source), gr.update(visible = end_visible)
def refresh_image_prompt_type_endcheckbox(state, image_prompt_type, image_prompt_type_radio, end_checkbox):
image_prompt_type = del_in_sequence(image_prompt_type, "E")
if end_checkbox: image_prompt_type += "E"
image_prompt_type = add_to_sequence(image_prompt_type, image_prompt_type_radio)
return image_prompt_type, gr.update(visible = "E" in image_prompt_type )
def refresh_video_prompt_type_image_refs(state, video_prompt_type, video_prompt_type_image_refs):
model_type = state["model_type"]
@ -6680,9 +6707,12 @@ def get_prompt_labels(multi_prompts_gen_type, image_outputs = False):
new_line_text = "each new line of prompt will be used for a window" if multi_prompts_gen_type != 0 else "each new line of prompt will generate " + ("a new image" if image_outputs else "a new video")
return "Prompts (" + new_line_text + ", # lines = comments, ! lines = macros)", "Prompts (" + new_line_text + ", # lines = comments)"
def get_image_end_label(multi_prompts_gen_type):
return "Images as ending points for new Videos in the Generation Queue" if multi_prompts_gen_type == 0 else "Images as ending points for each new Window of the same Video Generation"
def refresh_prompt_labels(multi_prompts_gen_type, image_mode):
prompt_label, wizard_prompt_label = get_prompt_labels(multi_prompts_gen_type, image_mode == 1)
return gr.update(label=prompt_label), gr.update(label = wizard_prompt_label)
return gr.update(label=prompt_label), gr.update(label = wizard_prompt_label), gr.update(label=get_image_end_label(multi_prompts_gen_type))
def show_preview_column_modal(state, column_no):
column_no = int(column_no)
@ -7054,101 +7084,46 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
with gr.Tab("Text to Image", id = "t2i", elem_classes="compact_tab"):
pass
with gr.Column(visible= test_class_i2v(model_type) or hunyuan_i2v or diffusion_forcing or ltxv or recammaster or vace or ti2v_2_2) as image_prompt_column:
if vace or infinitetalk:
image_prompt_types_allowed = model_def.get("image_prompt_types_allowed", "")
model_mode_choices = model_def.get("model_modes", None)
with gr.Column(visible= len(image_prompt_types_allowed)> 0 or model_mode_choices is not None) as image_prompt_column:
image_prompt_type_value= ui_defaults.get("image_prompt_type","")
image_prompt_type_value = "" if image_prompt_type_value == "S" else image_prompt_type_value
image_prompt_type = gr.Radio( [("New Video", ""),("Continue Video File", "V"),("Continue Last Video", "L")], value =image_prompt_type_value, label="Source Video", show_label= False, visible= not image_outputs , scale= 3)
image_start_row, image_start, image_start_extra = get_image_gallery(visible = False )
image_end_row, image_end, image_end_extra = get_image_gallery(visible = False )
video_source = gr.Video(label= "Video Source", height = gallery_height, visible = "V" in image_prompt_type_value, value= ui_defaults.get("video_source", None))
model_mode = gr.Dropdown(visible = False)
keep_frames_video_source = gr.Text(value=ui_defaults.get("keep_frames_video_source","") , visible= len(filter_letters(image_prompt_type_value, "VLG"))>0 , scale = 2, label= "Truncate Video beyond this number of resampled Frames (empty=Keep All, negative truncates from End)" )
any_video_source = True
elif diffusion_forcing or ltxv or ti2v_2_2:
image_prompt_type_value= ui_defaults.get("image_prompt_type","T")
# image_prompt_type = gr.Radio( [("Start Video with Image", "S"),("Start and End Video with Images", "SE"), ("Continue Video", "V"),("Text Prompt Only", "T")], value =image_prompt_type_value, label="Location", show_label= False, visible= True, scale= 3)
image_prompt_type_choices = [("Text Prompt Only", "T"),("Start Video with Image", "S")]
if ltxv:
image_prompt_type_choices += [("Use both a Start and an End Image", "SE")]
if sliding_window_enabled:
any_video_source = True
image_prompt_type_choices += [("Continue Video", "V")]
image_prompt_type = gr.Radio( image_prompt_type_choices, value =image_prompt_type_value, label="Location", show_label= False, visible= True , scale= 3)
image_start_row, image_start, image_start_extra = get_image_gallery(label= "Images as starting points for new videos", value = ui_defaults.get("image_start", None), visible= "S" in image_prompt_type_value )
image_end_row, image_end, image_end_extra = get_image_gallery(label= "Images as ending points for new videos", value = ui_defaults.get("image_end", None), visible= "E" in image_prompt_type_value )
video_source = gr.Video(label= "Video to Continue", height = gallery_height, visible= "V" in image_prompt_type_value, value= ui_defaults.get("video_source", None),)
if not diffusion_forcing:
model_mode = gr.Dropdown(
choices=[
], value=None,
visible= False
)
else:
model_mode = gr.Dropdown(
choices=[
("Synchronous", 0),
("Asynchronous (better quality but around 50% extra steps added)", 5),
],
value=ui_defaults.get("model_mode", 0),
label="Generation Type", scale = 3,
visible= True
)
keep_frames_video_source = gr.Text(value=ui_defaults.get("keep_frames_video_source","") , visible= "V" in image_prompt_type_value, scale = 2, label= "Truncate Video beyond this number of Frames of Video (empty=Keep All)" )
elif recammaster:
image_prompt_type = gr.Radio(choices=[("Source Video", "V")], value="V")
image_start_row, image_start, image_start_extra = get_image_gallery(visible = False )
image_end_row, image_end, image_end_extra = get_image_gallery(visible = False )
video_source = gr.Video(label= "Video Source", height = gallery_height, visible = True, value= ui_defaults.get("video_source", None),)
model_mode = gr.Dropdown(
choices=[
("Pan Right", 1),
("Pan Left", 2),
("Tilt Up", 3),
("Tilt Down", 4),
("Zoom In", 5),
("Zoom Out", 6),
("Translate Up (with rotation)", 7),
("Translate Down (with rotation)", 8),
("Arc Left (with rotation)", 9),
("Arc Right (with rotation)", 10),
],
value=ui_defaults.get("model_mode", 1),
label="Camera Movement Type", scale = 3,
visible= True
)
keep_frames_video_source = gr.Text(visible=False)
else:
if test_class_i2v(model_type) or hunyuan_i2v:
# image_prompt_type_value= ui_defaults.get("image_prompt_type","SE" if flf2v else "S" )
image_prompt_type_value= ui_defaults.get("image_prompt_type","S" )
image_prompt_type_choices = [("Start Video with Image", "S")]
image_prompt_type_choices += [("Use both a Start and an End Image", "SE")]
if not hunyuan_i2v:
any_video_source = True
image_prompt_type_choices += [("Continue Video", "V")]
image_prompt_type = gr.Radio( image_prompt_type_choices, value =image_prompt_type_value, label="Location", show_label= False, visible= not hunyuan_i2v, scale= 3)
image_prompt_type = gr.Text(value= image_prompt_type_value, visible= False)
image_prompt_type_choices = []
if "T" in image_prompt_types_allowed:
image_prompt_type_choices += [("Text Prompt Only", "")]
any_start_image = True
if "S" in image_prompt_types_allowed:
image_prompt_type_choices += [("Start Video with Image", "S")]
any_start_image = True
if "V" in image_prompt_types_allowed:
any_video_source = True
image_prompt_type_choices += [("Continue Video", "V")]
if "L" in image_prompt_types_allowed:
any_video_source = True
image_prompt_type_choices += [("Continue Last Video", "L")]
with gr.Group(visible= len(image_prompt_types_allowed)>1) as image_prompt_type_group:
with gr.Row():
image_prompt_type_radio_allowed_values= filter_letters(image_prompt_types_allowed, "SVL")
if len(image_prompt_type_choices) > 0:
image_prompt_type_radio = gr.Radio( image_prompt_type_choices, value =filter_letters(image_prompt_type_value, image_prompt_type_radio_allowed_values, image_prompt_type_choices[0][1]), label="Location", show_label= False, visible= len(image_prompt_types_allowed)>1, scale= 3)
else:
image_prompt_type_radio = gr.Radio(choices=[("", "")], value="", visible= False)
if "E" in image_prompt_types_allowed:
image_prompt_type_endcheckbox = gr.Checkbox( value ="E" in image_prompt_type_value, label="End Image(s)", show_label= False, visible= any_letters(image_prompt_type_value, "SVL") and not image_outputs , scale= 1)
any_end_image = True
image_start_row, image_start, image_start_extra = get_image_gallery(label= "Images as starting points for new videos", value = ui_defaults.get("image_start", None), visible= "S" in image_prompt_type_value )
image_end_row, image_end, image_end_extra = get_image_gallery(label= "Images as ending points for new videos", value = ui_defaults.get("image_end", None), visible= "E" in image_prompt_type_value )
if hunyuan_i2v:
video_source = gr.Video(value=None, visible=False)
else:
image_prompt_type_endcheckbox = gr.Checkbox( value =False, show_label= False, visible= False , scale= 1)
image_start_row, image_start, image_start_extra = get_image_gallery(label= "Images as starting points for new Videos in the Generation Queue", value = ui_defaults.get("image_start", None), visible= "S" in image_prompt_type_value )
video_source = gr.Video(label= "Video to Continue", height = gallery_height, visible= "V" in image_prompt_type_value, value= ui_defaults.get("video_source", None),)
else:
image_prompt_type = gr.Radio(choices=[("", "")], value="")
image_start_row, image_start, image_start_extra = get_image_gallery(visible = False )
image_end_row, image_end, image_end_extra = get_image_gallery(visible = False )
video_source = gr.Video(value=None, visible=False)
image_end_row, image_end, image_end_extra = get_image_gallery(label= get_image_end_label(ui_defaults.get("multi_prompts_gen_type", 0)), value = ui_defaults.get("image_end", None), visible= any_letters(image_prompt_type_value, "SVL") and ("E" in image_prompt_type_value) )
if model_mode_choices is None:
model_mode = gr.Dropdown(value=None, visible=False)
keep_frames_video_source = gr.Text(visible=False)
else:
model_mode = gr.Dropdown(choices=model_mode_choices["choices"], value=ui_defaults.get("model_mode", model_mode_choices["default"]), label=model_mode_choices["label"], visible=True)
keep_frames_video_source = gr.Text(value=ui_defaults.get("keep_frames_video_source","") , visible= len(filter_letters(image_prompt_type_value, "VL"))>0 , scale = 2, label= "Truncate Video beyond this number of resampled Frames (empty=Keep All, negative truncates from End)" )
with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or hunyuan_video_custom_edit or t2v or standin or ltxv or infinitetalk or flux and model_reference_image or qwen and model_reference_image) as video_prompt_column:
with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or hunyuan_video_custom_edit or t2v or standin or ltxv or infinitetalk or recammaster or flux and model_reference_image or qwen and model_reference_image) as video_prompt_column:
video_prompt_type_value= ui_defaults.get("video_prompt_type","")
video_prompt_type = gr.Text(value= video_prompt_type_value, visible= False)
any_control_video = True
@ -7208,12 +7183,12 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
value=filter_letters(video_prompt_type_value, "PDSLCMUV"),
label="Image to Image" if image_outputs else "Video to Video", scale = 3, visible= True, show_label= True,
)
elif infinitetalk:
video_prompt_type_video_guide = gr.Dropdown(value="", choices = [("","")], visible=False)
elif recammaster:
video_prompt_type_video_guide = gr.Dropdown(value="UV", choices = [("Control Video","UV")], visible=False)
else:
any_control_video = False
any_control_image = False
video_prompt_type_video_guide = gr.Dropdown(visible= False)
video_prompt_type_video_guide = gr.Dropdown(value="", choices = [("","")], visible=False)
if infinitetalk:
video_prompt_type_video_guide_alt = gr.Dropdown(
@ -7228,6 +7203,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
value=filter_letters(video_prompt_type_value, "RGUVQKI"),
label="Video to Video", scale = 3, visible= True, show_label= False,
)
any_control_video = any_control_image = True
else:
video_prompt_type_video_guide_alt = gr.Dropdown(value="", choices = [("","")], visible=False)
@ -7761,7 +7737,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, visible = False)
elif ltxv:
sliding_window_size = gr.Slider(41, get_max_frames(257), value=ui_defaults.get("sliding_window_size", 129), step=8, label="Sliding Window Size")
sliding_window_overlap = gr.Slider(9, 97, value=ui_defaults.get("sliding_window_overlap",9), step=8, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
sliding_window_overlap = gr.Slider(1, 97, value=ui_defaults.get("sliding_window_overlap",9), step=8, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
sliding_window_color_correction_strength = gr.Slider(0, 1, visible=False, value =0)
sliding_window_overlap_noise = gr.Slider(0, 100, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect", visible = False)
sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=8, label="Discard Last Frames of a Window (that may have bad quality)", visible = True)
@ -7772,8 +7748,9 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
sliding_window_overlap_noise = gr.Slider(0, 150, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect", visible = False)
sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, label="Discard Last Frames of a Window (that may have bad quality)", visible = True)
else: # Vace, Multitalk
sliding_window_defaults = model_def.get("sliding_window_defaults", {})
sliding_window_size = gr.Slider(5, get_max_frames(257), value=ui_defaults.get("sliding_window_size", 129), step=4, label="Sliding Window Size")
sliding_window_overlap = gr.Slider(1, 97, value=ui_defaults.get("sliding_window_overlap",5), step=4, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
sliding_window_overlap = gr.Slider(sliding_window_defaults.get("overlap_min", 1), sliding_window_defaults.get("overlap_max", 97), value=ui_defaults.get("sliding_window_overlap",sliding_window_defaults.get("overlap_default", 5)), step=sliding_window_defaults.get("overlap_step", 4), label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
sliding_window_color_correction_strength = gr.Slider(0, 1, value=ui_defaults.get("sliding_window_color_correction_strength",1), step=0.01, label="Color Correction Strength (match colors of new window with previous one, 0 = disabled)")
sliding_window_overlap_noise = gr.Slider(0, 150, value=ui_defaults.get("sliding_window_overlap_noise",20 if vace else 0), step=1, label="Noise to be added to overlapped frames to reduce blur effect" , visible = vace)
sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, label="Discard Last Frames of a Window (that may have bad quality)", visible = True)
@ -7790,13 +7767,13 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
multi_prompts_gen_type = gr.Dropdown(
choices=[
("Will create new generated Video", 0),
("Will create a new generated Video added to the Generation Queue", 0),
("Will be used for a new Sliding Window of the same Video Generation", 1),
],
value=ui_defaults.get("multi_prompts_gen_type",0),
visible=True,
scale = 1,
label="Text Prompts separated by a Carriage Return"
label="Images & Text Prompts separated by a Carriage Return" if (any_start_image or any_end_image) else "Text Prompts separated by a Carriage Return"
)
with gr.Tab("Misc.", visible = True) as misc_tab:
@ -7962,7 +7939,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
hidden_countdown_state = gr.Number(value=-1, visible=False, elem_id="hidden_countdown_state_num")
single_hidden_trigger_btn = gr.Button("trigger_countdown", visible=False, elem_id="trigger_info_single_btn")
extra_inputs = prompt_vars + [wizard_prompt, wizard_variables_var, wizard_prompt_activated_var, video_prompt_column, image_prompt_column,
extra_inputs = prompt_vars + [wizard_prompt, wizard_variables_var, wizard_prompt_activated_var, video_prompt_column, image_prompt_column, image_prompt_type_group, image_prompt_type_radio, image_prompt_type_endcheckbox,
prompt_column_advanced, prompt_column_wizard_vars, prompt_column_wizard, lset_name, save_lset_prompt_drop, advanced_row, speed_tab, audio_tab, mmaudio_col, quality_tab,
sliding_window_tab, misc_tab, prompt_enhancer_row, inference_steps_row, skip_layer_guidance_row, audio_guide_row, RIFLEx_setting_col,
video_prompt_type_video_guide, video_prompt_type_video_guide_alt, video_prompt_type_video_mask, video_prompt_type_image_refs, apg_col, audio_prompt_type_sources, audio_prompt_type_remux, audio_prompt_type_remux_row,
@ -7981,23 +7958,24 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
target_settings = gr.Text(value = "settings", interactive= False, visible= False)
last_choice = gr.Number(value =-1, interactive= False, visible= False)
resolution_group.input(fn=change_resolution_group, inputs=[state, resolution_group], outputs=[resolution])
resolution_group.input(fn=change_resolution_group, inputs=[state, resolution_group], outputs=[resolution], show_progress="hidden")
resolution.change(fn=record_last_resolution, inputs=[state, resolution])
# video_length.release(fn=refresh_video_length_label, inputs=[state, video_length ], outputs = video_length, trigger_mode="always_last" )
gr.on(triggers=[video_length.release, force_fps.change, video_guide.change, video_source.change], fn=refresh_video_length_label, inputs=[state, video_length, force_fps, video_guide, video_source] , outputs = video_length, trigger_mode="always_last" )
gr.on(triggers=[video_length.release, force_fps.change, video_guide.change, video_source.change], fn=refresh_video_length_label, inputs=[state, video_length, force_fps, video_guide, video_source] , outputs = video_length, trigger_mode="always_last", show_progress="hidden" )
guidance_phases.change(fn=change_guidance_phases, inputs= [state, guidance_phases], outputs =[model_switch_phase, guidance_phases_row, switch_threshold, switch_threshold2, guidance2_scale, guidance3_scale ])
audio_prompt_type_remux.change(fn=refresh_audio_prompt_type_remux, inputs=[state, audio_prompt_type, audio_prompt_type_remux], outputs=[audio_prompt_type])
audio_prompt_type_sources.change(fn=refresh_audio_prompt_type_sources, inputs=[state, audio_prompt_type, audio_prompt_type_sources], outputs=[audio_prompt_type, audio_guide, audio_guide2, speakers_locations_row])
image_prompt_type.change(fn=refresh_image_prompt_type, inputs=[state, image_prompt_type], outputs=[image_start_row, image_end_row, video_source, keep_frames_video_source] )
image_prompt_type_radio.change(fn=refresh_image_prompt_type_radio, inputs=[state, image_prompt_type, image_prompt_type_radio], outputs=[image_prompt_type, image_start_row, image_end_row, video_source, keep_frames_video_source, image_prompt_type_endcheckbox], show_progress="hidden" )
image_prompt_type_endcheckbox.change(fn=refresh_image_prompt_type_endcheckbox, inputs=[state, image_prompt_type, image_prompt_type_radio, image_prompt_type_endcheckbox], outputs=[image_prompt_type, image_end_row] )
# video_prompt_video_guide_trigger.change(fn=refresh_video_prompt_video_guide_trigger, inputs=[state, video_prompt_type, video_prompt_video_guide_trigger], outputs=[video_prompt_type, video_prompt_type_video_guide, video_guide, keep_frames_video_guide, denoising_strength, video_guide_outpainting_col, video_prompt_type_video_mask, video_mask, mask_expand])
video_prompt_type_image_refs.input(fn=refresh_video_prompt_type_image_refs, inputs = [state, video_prompt_type, video_prompt_type_image_refs], outputs = [video_prompt_type, image_refs_row, remove_background_images_ref, image_refs_relative_size, frames_positions,video_guide_outpainting_col])
video_prompt_type_video_guide.input(fn=refresh_video_prompt_type_video_guide, inputs = [state, video_prompt_type, video_prompt_type_video_guide, image_mode], outputs = [video_prompt_type, video_guide, image_guide, keep_frames_video_guide, denoising_strength, video_guide_outpainting_col, video_prompt_type_video_mask, video_mask, image_mask, mask_expand])
video_prompt_type_video_guide_alt.input(fn=refresh_video_prompt_type_video_guide_alt, inputs = [state, video_prompt_type, video_prompt_type_video_guide_alt], outputs = [video_prompt_type, video_guide, image_refs_row, denoising_strength ])
video_prompt_type_video_mask.input(fn=refresh_video_prompt_type_video_mask, inputs = [state, video_prompt_type, video_prompt_type_video_mask, image_mode], outputs = [video_prompt_type, video_mask, image_mask, mask_expand])
video_prompt_type_alignment.input(fn=refresh_video_prompt_type_alignment, inputs = [state, video_prompt_type, video_prompt_type_alignment], outputs = [video_prompt_type])
multi_prompts_gen_type.select(fn=refresh_prompt_labels, inputs=[multi_prompts_gen_type, image_mode], outputs=[prompt, wizard_prompt])
multi_prompts_gen_type.select(fn=refresh_prompt_labels, inputs=[multi_prompts_gen_type, image_mode], outputs=[prompt, wizard_prompt, image_end], show_progress="hidden")
video_guide_outpainting_top.input(fn=update_video_guide_outpainting, inputs=[video_guide_outpainting, video_guide_outpainting_top, gr.State(0)], outputs = [video_guide_outpainting], trigger_mode="multiple" )
video_guide_outpainting_bottom.input(fn=update_video_guide_outpainting, inputs=[video_guide_outpainting, video_guide_outpainting_bottom,gr.State(1)], outputs = [video_guide_outpainting], trigger_mode="multiple" )
video_guide_outpainting_left.input(fn=update_video_guide_outpainting, inputs=[video_guide_outpainting, video_guide_outpainting_left,gr.State(2)], outputs = [video_guide_outpainting], trigger_mode="multiple" )