mirror of https://github.com/Wan-Video/Wan2.1.git
synced 2025-12-17 12:43:36 +00:00

commit 94d9b4aa4d (parent cf02cc4004)

    fixed vace bugs

@@ -15,6 +15,7 @@ from .modules.model import WanModel
 from .modules.t5 import T5EncoderModel
 from .modules.vae import WanVAE
 from wan.modules.posemb_layers import get_rotary_pos_embed
+from wan.utils.utils import calculate_new_dimensions
 from .utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
 get_sampling_sigmas, retrieve_timesteps)
 from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
@@ -190,6 +191,7 @@ class DTT2V:
 input_video = None,
 height: int = 480,
 width: int = 832,
+fit_into_canvas = True,
 num_frames: int = 97,
 num_inference_steps: int = 50,
 shift: float = 1.0,
@@ -221,15 +223,16 @@ class DTT2V:
 i2v_extra_kwrags = {}
 prefix_video = None
 predix_video_latent_length = 0

 if input_video != None:
 _ , _ , height, width = input_video.shape
 elif image != None:
 image = image[0]
 frame_width, frame_height = image.size
-scale = min(height / frame_height, width / frame_width)
-height = (int(frame_height * scale) // 16) * 16
-width = (int(frame_width * scale) // 16) * 16
+height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas)
 image = np.array(image.resize((width, height))).transpose(2, 0, 1)


 latent_length = (num_frames - 1) // 4 + 1
 latent_height = height // 8
 latent_width = width // 8
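
The three hunks above belong to the diffusion-forcing pipeline (class DTT2V): generate() gains a fit_into_canvas flag, and the inline scale / floor-to-16 arithmetic for a prompt image is replaced by the calculate_new_dimensions helper that this commit adds to wan/utils/utils.py (see the hunk on that file further down). A minimal sketch of the resulting image branch, assuming it is run from inside the repository and using a stand-in 1024x768 image:

    import numpy as np
    from PIL import Image
    from wan.utils.utils import calculate_new_dimensions

    image = Image.new("RGB", (1024, 768))   # stand-in prompt image (width x height)
    height, width = 480, 832                # requested canvas, matching the defaults above

    frame_width, frame_height = image.size
    height, width = calculate_new_dimensions(height, width, frame_height, frame_width, True)
    pixels = np.array(image.resize((width, height))).transpose(2, 0, 1)   # HWC -> CHW

    print(height, width, pixels.shape)      # 480 640 (3, 480, 640)
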
@@ -25,7 +25,7 @@ from .utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
 get_sampling_sigmas, retrieve_timesteps)
 from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
 from wan.modules.posemb_layers import get_rotary_pos_embed
-from wan.utils.utils import resize_lanczos
+from wan.utils.utils import resize_lanczos, calculate_new_dimensions

 def optimized_scale(positive_flat, negative_flat):

@@ -120,7 +120,7 @@ class WanI2V:
 img2 = None,
 height =720,
 width = 1280,
-max_area=720 * 1280,
+fit_into_canvas = True,
 frame_num=81,
 shift=5.0,
 sample_solver='unipc',
@@ -189,21 +189,15 @@ class WanI2V:
 frame_num +=1
 lat_frames = int((frame_num - 2) // self.vae_stride[0] + 2)


 h, w = img.shape[1:]
-# aspect_ratio = h / w
-
-scale1 = min(height / h, width / w)
-scale2 = min(height / h, width / w)
-scale = max(scale1, scale2)
-new_height = int(h * scale)
-new_width = int(w * scale)
+h, w = calculate_new_dimensions(height, width, h, w, fit_into_canvas)

 lat_h = round(
-new_height // self.vae_stride[1] //
+h // self.vae_stride[1] //
 self.patch_size[1] * self.patch_size[1])
 lat_w = round(
-new_width // self.vae_stride[2] //
+w // self.vae_stride[2] //
 self.patch_size[2] * self.patch_size[2])
 h = lat_h * self.vae_stride[1]
 w = lat_w * self.vae_stride[2]
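
In the image-to-video hunks above, the max_area argument is dropped in favour of the same fit_into_canvas flag, and h / w now come straight from calculate_new_dimensions before being snapped to the latent grid. A small sketch of that snapping step, assuming the usual Wan2.1 values vae_stride = (4, 8, 8) and patch_size = (1, 2, 2) (these values are not shown in this diff): because the helper already rounds to multiples of 16, the snap leaves the dimensions unchanged.

    vae_stride = (4, 8, 8)   # assumed, not taken from this diff
    patch_size = (1, 2, 2)   # assumed, not taken from this diff

    h, w = 464, 832          # e.g. calculate_new_dimensions(480, 832, 1080, 1920, True)
    lat_h = round(h // vae_stride[1] // patch_size[1] * patch_size[1])
    lat_w = round(w // vae_stride[2] // patch_size[2] * patch_size[2])

    print(lat_h, lat_w, lat_h * vae_stride[1], lat_w * vae_stride[2])   # 58 104 464 832
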
@@ -963,7 +963,7 @@ class WanModel(ModelMixin, ConfigMixin):
 hints_list = [None ] *len(x_list)
 else:
 # Vace embeddings
-c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
+c = [self.vace_patch_embedding(u.to(self.vace_patch_embedding.weight.dtype).unsqueeze(0)) for u in vace_context]
 c = [u.flatten(2).transpose(1, 2) for u in c]
 c = c[0]

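
The single changed line above is one of the actual VACE fixes: the context tensors are cast to the dtype of the vace_patch_embedding weights before the convolution, which guards against a dtype mismatch between the VACE context and the embedding weights (for example half-precision weights fed a float32 context). A toy reproduction of that failure mode and of the fix pattern, using a small Linear layer as a stand-in for the patch embedding:

    import torch
    import torch.nn as nn

    embed = nn.Linear(8, 4).to(torch.bfloat16)   # stand-in for vace_patch_embedding
    u = torch.randn(3, 8)                        # float32 context tensor

    try:
        embed(u)                                 # fails: float32 input vs bfloat16 weights
    except RuntimeError as err:
        print("without cast:", err)

    c = embed(u.to(embed.weight.dtype))          # the pattern used by the fix above
    print("with cast:", c.dtype)                 # torch.bfloat16
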
@@ -177,15 +177,16 @@ class WanT2V:
 def vace_latent(self, z, m):
 return [torch.cat([zz, mm], dim=0) for zz, mm in zip(z, m)]

-def prepare_source(self, src_video, src_mask, src_ref_images, total_frames, image_size, device, original_video = False, keep_frames= [], start_frame = 0, pre_src_video = None):
+def prepare_source(self, src_video, src_mask, src_ref_images, total_frames, image_size, device, original_video = False, keep_frames= [], start_frame = 0, fit_into_canvas = True, pre_src_video = None):
 image_sizes = []
 trim_video = len(keep_frames)
+canvas_height, canvas_width = image_size

 for i, (sub_src_video, sub_src_mask, sub_pre_src_video) in enumerate(zip(src_video, src_mask,pre_src_video)):
 prepend_count = 0 if sub_pre_src_video == None else sub_pre_src_video.shape[1]
 num_frames = total_frames - prepend_count
 if sub_src_mask is not None and sub_src_video is not None:
-src_video[i], src_mask[i], _, _, _ = self.vid_proc.load_video_pair(sub_src_video, sub_src_mask, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame)
+src_video[i], src_mask[i], _, _, _ = self.vid_proc.load_video_pair(sub_src_video, sub_src_mask, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame, canvas_height = canvas_height, canvas_width = canvas_width, fit_into_canvas = fit_into_canvas)
 # src_video is [-1, 1], 0 = inpainting area (in fact 127 in [0, 255])
 # src_mask is [-1, 1], 0 = preserve original video (in fact 127 in [0, 255]) and 1 = Inpainting (in fact 255 in [0, 255])
 src_video[i] = src_video[i].to(device)
@@ -208,7 +209,7 @@ class WanT2V:
 src_mask[i] = torch.ones_like(src_video[i], device=device)
 image_sizes.append(image_size)
 else:
-src_video[i], _, _, _ = self.vid_proc.load_video(sub_src_video, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame)
+src_video[i], _, _, _ = self.vid_proc.load_video(sub_src_video, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame, canvas_height = canvas_height, canvas_width = canvas_width, fit_into_canvas = fit_into_canvas)
 src_video[i] = src_video[i].to(device)
 src_mask[i] = torch.zeros_like(src_video[i], device=device) if original_video else torch.ones_like(src_video[i], device=device)
 if prepend_count > 0:
@@ -277,6 +278,7 @@ class WanT2V:
 target_camera=None,
 context_scale=1.0,
 size=(1280, 720),
+fit_into_canvas = True,
 frame_num=81,
 shift=5.0,
 sample_solver='unipc',
@@ -430,7 +432,7 @@ class WanT2V:
 kwargs.update({'cam_emb': cam_emb})

 if vace:
-ref_images_count = len(input_ref_images[0]) if input_ref_images != None else 0
+ref_images_count = len(input_ref_images[0]) if input_ref_images != None and input_ref_images[0] != None else 0
 kwargs.update({'vace_context' : z, 'vace_context_scale' : context_scale})
 if overlapped_latents > 0:
 z_reactive = [ zz[0:16, ref_images_count:overlapped_latents + ref_images_count].clone() for zz in z]
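
prepare_source now receives the canvas policy and forwards it, together with image_size (reinterpreted as (canvas_height, canvas_width)), to the VACE video loader. The call shape after this change looks roughly as follows; the argument values are placeholders, only the keyword names come from the hunks above and from the matching wgp.py call later in this diff:

    src_video, src_mask, src_ref_images = wan_model.prepare_source(
        [video_guide],                # one guide clip per batch entry (placeholder)
        [video_mask],
        [image_refs],
        total_frames=81,
        image_size=(height, width),   # read back as canvas_height, canvas_width
        device="cuda",
        original_video=False,
        keep_frames=keep_frames_parsed,
        start_frame=0,
        fit_into_canvas=fit_canvas,   # new keyword threaded down to load_video_pair()
        pre_src_video=[pre_video_guide],
    )
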
@@ -67,7 +67,17 @@ def remove_background(img, session=None):
 return torch.from_numpy(np.array(img).astype(np.float32) / 255.0).movedim(-1, 0)


+def calculate_new_dimensions(canvas_height, canvas_width, height, width, fit_into_canvas):
+if fit_into_canvas:
+scale1 = min(canvas_height / height, canvas_width / width)
+scale2 = min(canvas_width / height, canvas_height / width)
+scale = max(scale1, scale2)
+else:
+scale = (canvas_height * canvas_width / (height * width))**(1/2)
+
+new_height = round( height * scale / 16) * 16
+new_width = round( width * scale / 16) * 16
+return new_height, new_width

 def resize_and_remove_background(img_list, budget_width, budget_height, rm_background, fit_into_canvas = False ):
 if rm_background:
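
For readability, here is the helper added above re-indented, together with what it returns for a 1080x1920 source mapped onto a 480x832 request (the function body is copied from the + lines; the example results are worked out from it):

    def calculate_new_dimensions(canvas_height, canvas_width, height, width, fit_into_canvas):
        if fit_into_canvas:
            scale1 = min(canvas_height / height, canvas_width / width)
            scale2 = min(canvas_width / height, canvas_height / width)
            scale = max(scale1, scale2)
        else:
            scale = (canvas_height * canvas_width / (height * width))**(1/2)

        new_height = round( height * scale / 16) * 16
        new_width = round( width * scale / 16) * 16
        return new_height, new_width

    # fit_into_canvas=True: fit inside the 480x832 canvas, sides rounded to multiples of 16
    print(calculate_new_dimensions(480, 832, 1080, 1920, True))    # (464, 832)

    # fit_into_canvas=False: keep roughly the 480x832 pixel budget, so one side
    # may exceed the request (the "Pixels Budget" option in the new wgp.py setting)
    print(calculate_new_dimensions(480, 832, 1080, 1920, False))   # (480, 848)
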
@@ -5,6 +5,7 @@ from PIL import Image
 import torch
 import torch.nn.functional as F
 import torchvision.transforms.functional as TF
+from .utils import calculate_new_dimensions


 class VaceImageProcessor(object):
@@ -182,53 +183,22 @@ class VaceVideoProcessor(object):



-def _get_frameid_bbox_adjust_last(self, fps, video_frames_count, h, w, crop_box, rng, max_frames= 0, start_frame =0):
+def _get_frameid_bbox_adjust_last(self, fps, video_frames_count, canvas_height, canvas_width, h, w, fit_into_canvas, crop_box, rng, max_frames= 0, start_frame =0):
 from wan.utils.utils import resample

 target_fps = self.max_fps

-# video_frames_count = len(frame_timestamps)

 frame_ids= resample(fps, video_frames_count, max_frames, target_fps, start_frame )

 x1, x2, y1, y2 = [0, w, 0, h] if crop_box is None else crop_box
 h, w = y2 - y1, x2 - x1
-ratio = h / w
-df, dh, dw = self.downsample
-seq_len = self.seq_len
-# min/max area of the [latent video]
-min_area_z = self.min_area / (dh * dw)
-# max_area_z = min(seq_len, self.max_area / (dh * dw), (h // dh) * (w // dw))
-max_area_z = min_area_z # workaround bug
-# sample a frame number of the [latent video]
-rand_area_z = np.square(np.power(2, rng.uniform(
-np.log2(np.sqrt(min_area_z)),
-np.log2(np.sqrt(max_area_z))
-)))
-
-seq_len = max_area_z * ((max_frames- start_frame - 1) // df +1)
-
-# of = min(
-# (len(frame_ids) - 1) // df + 1,
-# int(seq_len / rand_area_z)
-# )
-of = (len(frame_ids) - 1) // df + 1
-
-
-# deduce target shape of the [latent video]
-# target_area_z = min(max_area_z, int(seq_len / of))
-target_area_z = max_area_z
-oh = round(np.sqrt(target_area_z * ratio))
-ow = int(target_area_z / oh)
-of = (of - 1) * df + 1
-oh *= dh
-ow *= dw
-
+oh, ow = calculate_new_dimensions(canvas_height, canvas_width, h, w, fit_into_canvas)
 return frame_ids, (x1, x2, y1, y2), (oh, ow), target_fps

-def _get_frameid_bbox(self, fps, video_frames_count, h, w, crop_box, rng, max_frames= 0, start_frame= 0):
+def _get_frameid_bbox(self, fps, video_frames_count, h, w, crop_box, rng, max_frames= 0, start_frame= 0, canvas_height = 0, canvas_width = 0, fit_into_canvas= True):
 if self.keep_last:
-return self._get_frameid_bbox_adjust_last(fps, video_frames_count, h, w, crop_box, rng, max_frames= max_frames, start_frame= start_frame)
+return self._get_frameid_bbox_adjust_last(fps, video_frames_count, canvas_height, canvas_width, h, w, fit_into_canvas, crop_box, rng, max_frames= max_frames, start_frame= start_frame)
 else:
 return self._get_frameid_bbox_default(fps, video_frames_count, h, w, crop_box, rng, max_frames= max_frames)

@@ -238,7 +208,7 @@ class VaceVideoProcessor(object):
 def load_video_pair(self, data_key, data_key2, crop_box=None, seed=2024, **kwargs):
 return self.load_video_batch(data_key, data_key2, crop_box=crop_box, seed=seed, **kwargs)

-def load_video_batch(self, *data_key_batch, crop_box=None, seed=2024, max_frames= 0, trim_video =0, start_frame = 0, **kwargs):
+def load_video_batch(self, *data_key_batch, crop_box=None, seed=2024, max_frames= 0, trim_video =0, start_frame = 0, canvas_height = 0, canvas_width = 0, fit_into_canvas = False, **kwargs):
 rng = np.random.default_rng(seed + hash(data_key_batch[0]) % 10000)
 # read video
 import decord
@@ -269,7 +239,7 @@ class VaceVideoProcessor(object):
 h, w = src_video.shape[1:3]
 else:
 h, w = readers[0].next().shape[:2]
-frame_ids, (x1, x2, y1, y2), (oh, ow), fps = self._get_frameid_bbox(fps, length, h, w, crop_box, rng, max_frames=max_frames, start_frame = start_frame )
+frame_ids, (x1, x2, y1, y2), (oh, ow), fps = self._get_frameid_bbox(fps, length, h, w, crop_box, rng, canvas_height = canvas_height, canvas_width = canvas_width, fit_into_canvas = fit_into_canvas, max_frames=max_frames, start_frame = start_frame )

 # preprocess video
 videos = [reader.get_batch(frame_ids)[:, y1:y2, x1:x2, :] for reader in readers]
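
Stripped of the deleted lines, the frame and size selection that the VaceVideoProcessor hunks above leave behind reduces to roughly the following; the latent-area sampling block is gone and the output size now comes straight from the canvas settings handed down through load_video_batch (a condensed restatement of the + lines, with target_fps inlined):

    def _get_frameid_bbox_adjust_last(self, fps, video_frames_count, canvas_height, canvas_width,
                                      h, w, fit_into_canvas, crop_box, rng, max_frames=0, start_frame=0):
        from wan.utils.utils import resample

        frame_ids = resample(fps, video_frames_count, max_frames, self.max_fps, start_frame)
        x1, x2, y1, y2 = [0, w, 0, h] if crop_box is None else crop_box
        h, w = y2 - y1, x2 - x1
        oh, ow = calculate_new_dimensions(canvas_height, canvas_width, h, w, fit_into_canvas)
        return frame_ids, (x1, x2, y1, y2), (oh, ow), self.max_fps
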
wgp.py (207 changed lines)

@@ -84,7 +84,6 @@ def format_time(seconds):
 hours = int(seconds // 3600)
 minutes = int((seconds % 3600) // 60)
 return f"{hours}h {minutes}m"
-
 def pil_to_base64_uri(pil_image, format="png", quality=75):
 if pil_image is None:
 return None
@@ -275,12 +274,12 @@ def process_prompt_and_add_tasks(state, model_choice):
 video_guide = inputs["video_guide"]
 video_mask = inputs["video_mask"]

-if "1.3B" in model_filename :
-resolution_reformated = str(height) + "*" + str(width)
-if not resolution_reformated in VACE_SIZE_CONFIGS:
-res = (" and ").join(VACE_SIZE_CONFIGS.keys())
-gr.Info(f"Video Resolution for Vace model is not supported. Only {res} resolutions are allowed.")
-return
+# if "1.3B" in model_filename :
+# resolution_reformated = str(height) + "*" + str(width)
+# if not resolution_reformated in VACE_SIZE_CONFIGS:
+# res = (" and ").join(VACE_SIZE_CONFIGS.keys())
+# gr.Info(f"Video Resolution for Vace model is not supported. Only {res} resolutions are allowed.")
+# return
 if "I" in video_prompt_type:
 if image_refs == None:
 gr.Info("You must provide at least one Refererence Image")
@@ -1995,7 +1994,8 @@ def apply_changes( state,
 boost_choice = 1,
 clear_file_list = 0,
 preload_model_policy_choice = 1,
-UI_theme_choice = "default"
+UI_theme_choice = "default",
+fit_canvas_choice = 0
 ):
 if args.lock_config:
 return
@@ -2016,7 +2016,8 @@ def apply_changes( state,
 "boost" : boost_choice,
 "clear_file_list" : clear_file_list,
 "preload_model_policy" : preload_model_policy_choice,
-"UI_theme" : UI_theme_choice
+"UI_theme" : UI_theme_choice,
+"fit_canvas": fit_canvas_choice,
 }

 if Path(server_config_filename).is_file():
@@ -2050,7 +2051,7 @@ def apply_changes( state,
 transformer_quantization = server_config["transformer_quantization"]
 transformer_types = server_config["transformer_types"]

-if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list"] for change in changes ):
+if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas"] for change in changes ):
 model_choice = gr.Dropdown()
 else:
 reload_needed = True
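
On the wgp.py side, the new fit_canvas value is persisted in server_config, treated as a setting that can be applied without reloading the model, and read back at generation time. An illustrative round trip; the key names and the list contents come from the hunks above, while the local names hot_settings, changes and needs_reload exist only for this sketch:

    server_config = {"UI_theme": "default", "fit_canvas": 1}   # abbreviated

    hot_settings = {"attention_mode", "vae_config", "boost", "save_path",
                    "metadata_type", "clear_file_list", "fit_canvas"}
    changes = ["fit_canvas"]
    needs_reload = not all(change in hot_settings for change in changes)

    fit_canvas = server_config.get("fit_canvas", 0)   # as generate_video() now does
    print(needs_reload, fit_canvas)                   # False 1
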
@@ -2413,7 +2414,7 @@ def generate_video(
 file_list = gen["file_list"]
 prompt_no = gen["prompt_no"]

+fit_canvas = server_config.get("fit_canvas", 0)
 # if wan_model == None:
 # gr.Info("Unable to generate a Video while a new configuration is being applied.")
 # return
@@ -2555,7 +2556,7 @@ def generate_video(
 source_video = None
 target_camera = None
 if "recam" in model_filename:
-source_video = preprocess_video("", width=width, height=height,video_in=video_source, max_frames= video_length, start_frame = 0, fit_canvas= True)
+source_video = preprocess_video("", width=width, height=height,video_in=video_source, max_frames= video_length, start_frame = 0, fit_canvas= fit_canvas)
 target_camera = model_mode

 audio_proj_split = None
@@ -2646,7 +2647,7 @@ def generate_video(
 elif diffusion_forcing:
 if video_source != None and len(video_source) > 0 and window_no == 1:
 keep_frames_video_source= 1000 if len(keep_frames_video_source) ==0 else int(keep_frames_video_source)
-prefix_video = preprocess_video(None, width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas= True, target_fps = fps)
+prefix_video = preprocess_video(None, width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas= fit_canvas, target_fps = fps)
 prefix_video = prefix_video .permute(3, 0, 1, 2)
 prefix_video = prefix_video .float().div_(127.5).sub_(1.) # c, f, h, w
 prefix_video_frames_count = prefix_video.shape[1]
@@ -2675,13 +2676,13 @@ def generate_video(

 if preprocess_type != None :
 send_cmd("progress", progress_args)
-video_guide_copy = preprocess_video(preprocess_type, width=width, height=height,video_in=video_guide, max_frames= video_length if window_no == 1 else video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas = True, target_fps = fps)
+video_guide_copy = preprocess_video(preprocess_type, width=width, height=height,video_in=video_guide, max_frames= video_length if window_no == 1 else video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas = fit_canvas, target_fps = fps)
 keep_frames_parsed, error = parse_keep_frames_video_guide(keep_frames_video_guide, max_frames_to_generate)
 if len(error) > 0:
 raise gr.Error(f"invalid keep frames {keep_frames_video_guide}")
 keep_frames_parsed = keep_frames_parsed[guide_start_frame: guide_start_frame + video_length]
 if window_no == 1:
-image_size = VACE_SIZE_CONFIGS[resolution_reformated] # default frame dimensions until it is set by video_src (if there is any)
+image_size = (height, width) # VACE_SIZE_CONFIGS[resolution_reformated] # default frame dimensions until it is set by video_src (if there is any)
 src_video, src_mask, src_ref_images = wan_model.prepare_source([video_guide_copy],
 [video_mask_copy ],
 [image_refs_copy],
@@ -2689,10 +2690,11 @@ def generate_video(
 original_video= "O" in video_prompt_type,
 keep_frames=keep_frames_parsed,
 start_frame = guide_start_frame,
-pre_src_video = [pre_video_guide]
+pre_src_video = [pre_video_guide],
+fit_into_canvas = fit_canvas
 )
-if window_no == 1 and src_video != None and len(src_video) > 0:
-image_size = src_video[0].shape[-2:]
+# if window_no == 1 and src_video != None and len(src_video) > 0:
+# image_size = src_video[0].shape[-2:]
 prompts_max = gen["prompts_max"]
 status = get_latest_status(state)

@@ -2722,6 +2724,7 @@ def generate_video(
 # max_area=MAX_AREA_CONFIGS[resolution_reformated],
 height = height,
 width = width,
+fit_into_canvas = fit_canvas,
 shift=flow_shift,
 sampling_steps=num_inference_steps,
 guide_scale=guidance_scale,
@@ -2750,6 +2753,7 @@ def generate_video(
 input_video= pre_video_guide,
 height = height,
 width = width,
+fit_into_canvas = fit_canvas,
 seed = seed,
 num_frames = (video_length // 4)* 4 + 1, #377
 num_inference_steps = num_inference_steps,
@@ -2777,6 +2781,7 @@ def generate_video(
 target_camera= target_camera,
 frame_num=(video_length // 4)* 4 + 1,
 size=(width, height),
+fit_into_canvas = fit_canvas,
 shift=flow_shift,
 sampling_steps=num_inference_steps,
 guide_scale=guidance_scale,

@@ -4042,17 +4047,13 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
 wizard_prompt_activated_var = gr.Text(wizard_prompt_activated, visible= False)
 wizard_variables_var = gr.Text(wizard_variables, visible = False)
 with gr.Row():
-if test_class_i2v(model_filename) and False:
-resolution = gr.Dropdown(
-choices=[
-# 720p
-("720p (same amount of pixels)", "1280x720"),
-("480p (same amount of pixels)", "832x480"),
-],
-value=ui_defaults.get("resolution","480p"),
-label="Resolution (video will have the same height / width ratio than the original image)"
-)
+if test_class_i2v(model_filename):
+if server_config.get("fit_canvas", 0) == 1:
+label = "Max Resolution (as it maybe less depending on video width / height ratio)"
 else:
+label = "Max Resolution (as it maybe less depending on video width / height ratio)"
+else:
+label = "Max Resolution (as it maybe less depending on video width / height ratio)"
 resolution = gr.Dropdown(
 choices=[
 # 720p
@@ -4073,7 +4074,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
 ("512x512 (1:1, 480p)", "512x512"),
 ],
 value=ui_defaults.get("resolution","832x480"),
-label="Max Resolution (as it maybe less depending on video width / height ratio)" if test_class_i2v(model_filename) else "Resolution"
+label= label
 )
 with gr.Row():
 if recammaster:

@@ -4556,6 +4557,10 @@ def generate_configuration_tab(state, blocks, header, model_choice):
 with gr.Column():
 model_list = []


+with gr.Tabs():
+# with gr.Row(visible=advanced_ui) as advanced_row:
+with gr.Tab("General"):
 for model_type in model_types:
 choice = get_model_filename(model_type, transformer_quantization)
 model_list.append(choice)
@@ -4568,6 +4573,82 @@ def generate_configuration_tab(state, blocks, header, model_choice):
 multiselect= True
 )

+fit_canvas_choice = gr.Dropdown(
+choices=[
+("Dimensions correspond to the Pixels Budget (as the Prompt Image/Video will be resized to match this pixels budget, output video height or width may exceed the requested dimensions )", 0),
+("Dimensions correspond to the Maximum Width and Height (as the Prompt Image/Video will be resized to fit into these dimensions, the output video may be smaller)", 1),
+],
+value= server_config.get("fit_canvas", 0),
+label="Generated Video Dimensions when Prompt contains an Image or a Video",
+interactive= not lock_ui_attention
+)
+
+
+def check(mode):
+if not mode in attention_modes_installed:
+return " (NOT INSTALLED)"
+elif not mode in attention_modes_supported:
+return " (NOT SUPPORTED)"
+else:
+return ""
+attention_choice = gr.Dropdown(
+choices=[
+("Auto : pick sage2 > sage > sdpa depending on what is installed", "auto"),
+("Scale Dot Product Attention: default, always available", "sdpa"),
+("Flash" + check("flash")+ ": good quality - requires additional install (usually complex to set up on Windows without WSL)", "flash"),
+("Xformers" + check("xformers")+ ": good quality - requires additional install (usually complex, may consume less VRAM to set up on Windows without WSL)", "xformers"),
+("Sage" + check("sage")+ ": 30% faster but slightly worse quality - requires additional install (usually complex to set up on Windows without WSL)", "sage"),
+("Sage2" + check("sage2")+ ": 40% faster but slightly worse quality - requires additional install (usually complex to set up on Windows without WSL)", "sage2"),
+],
+value= attention_mode,
+label="Attention Type",
+interactive= not lock_ui_attention
+)
+
+
+metadata_choice = gr.Dropdown(
+choices=[
+("Export JSON files", "json"),
+("Add metadata to video", "metadata"),
+("Neither", "none")
+],
+value=server_config.get("metadata_type", "metadata"),
+label="Metadata Handling"
+)
+preload_model_policy_choice = gr.CheckboxGroup([("Preload Model while Launching the App","P"), ("Preload Model while Switching Model", "S"), ("Unload Model when Queue is Done", "U")],
+value=server_config.get("preload_model_policy",[]),
+label="RAM Loading / Unloading Model Policy (in any case VRAM will be freed once the queue has been processed)"
+)
+
+clear_file_list_choice = gr.Dropdown(
+choices=[
+("None", 0),
+("Keep the last video", 1),
+("Keep the last 5 videos", 5),
+("Keep the last 10 videos", 10),
+("Keep the last 20 videos", 20),
+("Keep the last 30 videos", 30),
+],
+value=server_config.get("clear_file_list", 5),
+label="Keep Previously Generated Videos when starting a new Generation Batch"
+)
+
+UI_theme_choice = gr.Dropdown(
+choices=[
+("Blue Sky", "default"),
+("Classic Gradio", "gradio"),
+],
+value=server_config.get("UI_theme_choice", "default"),
+label="User Interface Theme. You will need to restart the App the see new Theme."
+)
+
+save_path_choice = gr.Textbox(
+label="Output Folder for Generated Videos",
+value=server_config.get("save_path", save_path)
+)
+
+with gr.Tab("Performance"):
+
 quantization_choice = gr.Dropdown(
 choices=[
 ("Scaled Int8 Quantization (recommended)", "int8"),

@@ -4606,31 +4687,7 @@ def generate_configuration_tab(state, blocks, header, model_choice):
 label="VAE Encoding / Decoding precision"
 )

-save_path_choice = gr.Textbox(
-label="Output Folder for Generated Videos",
-value=server_config.get("save_path", save_path)
-)
-def check(mode):
-if not mode in attention_modes_installed:
-return " (NOT INSTALLED)"
-elif not mode in attention_modes_supported:
-return " (NOT SUPPORTED)"
-else:
-return ""
-attention_choice = gr.Dropdown(
-choices=[
-("Auto : pick sage2 > sage > sdpa depending on what is installed", "auto"),
-("Scale Dot Product Attention: default, always available", "sdpa"),
-("Flash" + check("flash")+ ": good quality - requires additional install (usually complex to set up on Windows without WSL)", "flash"),
-("Xformers" + check("xformers")+ ": good quality - requires additional install (usually complex, may consume less VRAM to set up on Windows without WSL)", "xformers"),
-("Sage" + check("sage")+ ": 30% faster but slightly worse quality - requires additional install (usually complex to set up on Windows without WSL)", "sage"),
-("Sage2" + check("sage2")+ ": 40% faster but slightly worse quality - requires additional install (usually complex to set up on Windows without WSL)", "sage2"),
-],
-value= attention_mode,
-label="Attention Type",
-interactive= not lock_ui_attention
-)
-gr.Markdown("Beware: when restarting the server or changing a resolution or video duration, the first step of generation for a duration / resolution may last a few minutes due to recompilation")
+gr.Text("Beware: when restarting the server or changing a resolution or video duration, the first step of generation for a duration / resolution may last a few minutes due to recompilation", interactive= False, show_label= False )
 compile_choice = gr.Dropdown(
 choices=[
 ("ON: works only on Linux / WSL", "transformer"),

@@ -4640,6 +4697,7 @@ def generate_configuration_tab(state, blocks, header, model_choice):
 label="Compile Transformer (up to 50% faster and 30% more frames but requires Linux / WSL and Flash or Sage attention)",
 interactive= not lock_ui_compile
 )
+
 vae_config_choice = gr.Dropdown(
 choices=[
 ("Auto", 0),
@@ -4650,6 +4708,7 @@ def generate_configuration_tab(state, blocks, header, model_choice):
 value= vae_config,
 label="VAE Tiling - reduce the high VRAM requirements for VAE decoding and VAE encoding (if enabled it will be slower)"
 )
+
 boost_choice = gr.Dropdown(
 choices=[
 # ("Auto (ON if Video longer than 5s)", 0),
@@ -4657,8 +4716,9 @@ def generate_configuration_tab(state, blocks, header, model_choice):
 ("OFF", 2),
 ],
 value=boost,
-label="Boost: Give a 10% speed speedup without losing quality at the cost of a litle VRAM (up to 1GB for max frames and resolution)"
+label="Boost: Give a 10% speedup without losing quality at the cost of a litle VRAM (up to 1GB at max frames and resolution)"
 )
+
 profile_choice = gr.Dropdown(
 choices=[
 ("HighRAM_HighVRAM, profile 1: at least 48 GB of RAM and 24 GB of VRAM, the fastest for short videos a RTX 3090 / RTX 4090", 1),
@@ -4671,41 +4731,7 @@ def generate_configuration_tab(state, blocks, header, model_choice):
 label="Profile (for power users only, not needed to change it)"
 )

-metadata_choice = gr.Dropdown(
-choices=[
-("Export JSON files", "json"),
-("Add metadata to video", "metadata"),
-("Neither", "none")
-],
-value=server_config.get("metadata_type", "metadata"),
-label="Metadata Handling"
-)
-preload_model_policy_choice = gr.CheckboxGroup([("Preload Model while Launching the App","P"), ("Preload Model while Switching Model", "S"), ("Unload Model when Queue is Done", "U")],
-value=server_config.get("preload_model_policy",[]),
-label="RAM Loading / Unloading Model Policy (in any case VRAM will be freed once the queue has been processed)"
-)
-
-clear_file_list_choice = gr.Dropdown(
-choices=[
-("None", 0),
-("Keep the last video", 1),
-("Keep the last 5 videos", 5),
-("Keep the last 10 videos", 10),
-("Keep the last 20 videos", 20),
-("Keep the last 30 videos", 30),
-],
-value=server_config.get("clear_file_list", 5),
-label="Keep Previously Generated Videos when starting a Generation Batch"
-)
-
-UI_theme_choice = gr.Dropdown(
-choices=[
-("Blue Sky", "default"),
-("Classic Gradio", "gradio"),
-],
-value=server_config.get("UI_theme_choice", "default"),
-label="User Interface Theme. You will need to restart the App the see new Theme."
-)

 msg = gr.Markdown()
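
Taken together, the configuration-tab hunks above split the settings into two gr.Tab containers: the attention, metadata, preload, file-list, theme, save-path and new canvas controls move into a "General" tab, while quantization, compilation, VAE tiling, boost and profile stay under "Performance". A rough outline of the resulting layout (the actual widget constructors are elided):

    import gradio as gr

    with gr.Blocks():
        with gr.Tabs():
            with gr.Tab("General"):
                ...   # model list, fit_canvas_choice, attention_choice, metadata_choice,
                      # preload_model_policy_choice, clear_file_list_choice,
                      # UI_theme_choice, save_path_choice
            with gr.Tab("Performance"):
                ...   # quantization_choice, compile_choice, vae_config_choice,
                      # boost_choice, profile_choice, recompilation warning text
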
@@ -4728,7 +4754,8 @@ def generate_configuration_tab(state, blocks, header, model_choice):
 boost_choice,
 clear_file_list_choice,
 preload_model_policy_choice,
-UI_theme_choice
+UI_theme_choice,
+fit_canvas_choice
 ],
 outputs= [msg , header, model_choice]
 )