diff --git a/models/flux/flux_handler.py b/models/flux/flux_handler.py
index 162ec4c..b8b7b9b 100644
--- a/models/flux/flux_handler.py
+++ b/models/flux/flux_handler.py
@@ -26,12 +26,13 @@ class family_handler():
             model_def_output["no_background_removal"] = True
 
             model_def_output["image_ref_choices"] = {
-                "choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "I"),
-                            ("Up to two Images are Style Images", "IJ")],
-                "default": "I",
-                "letters_filter": "IJ",
+                "choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "KI"),
+                            ("Up to two Images are Style Images", "KIJ")],
+                "default": "KI",
+                "letters_filter": "KIJ",
                 "label": "Reference Images / Style Images"
             }
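+        # keep the reference images' own aspect ratios instead of fitting them into the output canvas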
+        model_def_output["lock_image_refs_ratios"] = True
 
         return model_def_output
 
@@ -107,6 +108,16 @@ class family_handler():
             pipe["feature_embedder"] = flux_model.feature_embedder 
         return flux_model, pipe
 
+    @staticmethod
+    def fix_settings(base_model_type, settings_version, model_def, ui_defaults):
+        flux_model = model_def.get("flux-model", "flux-dev")
+        flux_uso = flux_model == "flux-dev-uso"
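+        # migrate pre-2.29 USO settings: the reference image letter "I" is now prefixed with "K"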
+        if flux_uso and settings_version < 2.29:
+            video_prompt_type = ui_defaults.get("video_prompt_type", "")
+            if "I" in video_prompt_type:
+                video_prompt_type = video_prompt_type.replace("I", "KI")
+                ui_defaults["video_prompt_type"] = video_prompt_type 
+
     @staticmethod
     def update_default_settings(base_model_type, model_def, ui_defaults):
         flux_model = model_def.get("flux-model", "flux-dev")
@@ -116,6 +127,6 @@ class family_handler():
         })            
         if model_def.get("reference_image", False):
             ui_defaults.update({
-                "video_prompt_type": "I" if flux_uso else "KI",
+                "video_prompt_type": "KI",
             })
 
diff --git a/models/flux/flux_main.py b/models/flux/flux_main.py
index 55a2b91..9bb8e73 100644
--- a/models/flux/flux_main.py
+++ b/models/flux/flux_main.py
@@ -9,6 +9,9 @@ from shared.utils.utils import calculate_new_dimensions
 from .sampling import denoise, get_schedule, prepare_kontext, prepare_prompt, prepare_multi_ip, unpack
 from .modules.layers import get_linear_split_map
 from transformers import SiglipVisionModel, SiglipImageProcessor
+import torchvision.transforms.functional as TVF
+import math
+from shared.utils.utils import convert_image_to_tensor, convert_tensor_to_image
 
 from .util import (
     aspect_ratio_to_height_width,
@@ -21,6 +24,44 @@ from .util import (
 
 from PIL import Image
 
+def resize_and_centercrop_image(image, target_height_ref1, target_width_ref1):
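+    """Resize an image tensor (..., H, W) so it covers the target box, then center-crop it to
+    (target_height_ref1, target_width_ref1), with both targets snapped down to multiples of 64."""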
+    target_height_ref1 = int(target_height_ref1 // 64 * 64)
+    target_width_ref1 = int(target_width_ref1 // 64 * 64)
+    h, w = image.shape[-2:]
+    if h < target_height_ref1 or w < target_width_ref1:
+        # compute the aspect ratio
+        aspect_ratio = w / h
+        if h < target_height_ref1:
+            new_h = target_height_ref1
+            new_w = new_h * aspect_ratio
+            if new_w < target_width_ref1:
+                new_w = target_width_ref1
+                new_h = new_w / aspect_ratio
+        else:
+            new_w = target_width_ref1
+            new_h = new_w / aspect_ratio
+            if new_h < target_height_ref1:
+                new_h = target_height_ref1
+                new_w = new_h * aspect_ratio
+    else:
+        aspect_ratio = w / h
+        tgt_aspect_ratio = target_width_ref1 / target_height_ref1
+        if aspect_ratio > tgt_aspect_ratio:
+            new_h = target_height_ref1
+            new_w = new_h * aspect_ratio
+        else:
+            new_w = target_width_ref1
+            new_h = new_w / aspect_ratio
+    # resize the image with TVF.resize
+    image = TVF.resize(image, (math.ceil(new_h), math.ceil(new_w)))
+    # compute the center-crop offsets
+    top = (image.shape[-2] - target_height_ref1) // 2
+    left = (image.shape[-1] - target_width_ref1) // 2
+    # center-crop with TVF.crop
+    image = TVF.crop(image, top, left, target_height_ref1, target_width_ref1)
+    return image
+
+
 def stitch_images(img1, img2):
     # Resize img2 to match img1's height
     width1, height1 = img1.size
@@ -129,11 +170,11 @@ class model_factory:
             if n_prompt is None or len(n_prompt) == 0: n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
             device="cuda"
             flux_dev_uso = self.name in ['flux-dev-uso']
-            image_stiching =  not self.name in ['flux-dev-uso']
-
+            image_stiching =  not self.name in ['flux-dev-uso'] #and False
+            # image_refs_relative_size = 100
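+            # when True, reference images are resized to cover and center-cropped rather than aspect-fit resized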
+            crop = False
             input_ref_images = [] if input_ref_images is None else input_ref_images[:]
             ref_style_imgs = []
-
             if "I" in video_prompt_type and len(input_ref_images) > 0: 
                 if flux_dev_uso :
                     if "J" in video_prompt_type:
@@ -148,7 +189,7 @@ class model_factory:
                     if "K" in video_prompt_type :
                         w, h = input_ref_images[0].size
                         height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
-
+                        # actual rescale will happen in prepare_kontext
                     for new_img in input_ref_images[1:]:
                         stiched = stitch_images(stiched, new_img)
                     input_ref_images  = [stiched]
@@ -157,14 +198,24 @@ class model_factory:
                     if "K" in video_prompt_type:
                         # image latents tiling method
                         w, h = input_ref_images[0].size
-                        height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
-                        input_ref_images[0] = input_ref_images[0].resize((width, height), resample=Image.Resampling.LANCZOS) 
+                        if crop :
+                            img = convert_image_to_tensor(input_ref_images[0])
+                            img = resize_and_centercrop_image(img, height, width)                       
+                            input_ref_images[0] = convert_tensor_to_image(img)                    
+                        else:
+                            height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
+                            input_ref_images[0] = input_ref_images[0].resize((width, height), resample=Image.Resampling.LANCZOS) 
                         first_ref = 1
 
                     for i in range(first_ref,len(input_ref_images)):
                         w, h = input_ref_images[i].size
-                        image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas)
-                        input_ref_images[0] = input_ref_images[0].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) 
+                        if crop:
+                            img = convert_image_to_tensor(input_ref_images[i])
+                            img = resize_and_centercrop_image(img, int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100)) 
+                            input_ref_images[i] = convert_tensor_to_image(img)                    
+                        else:
+                            image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas)
+                            input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS) 
             else:
                 input_ref_images = None
 
diff --git a/models/flux/sampling.py b/models/flux/sampling.py
index 5534e9f..f43ae15 100644
--- a/models/flux/sampling.py
+++ b/models/flux/sampling.py
@@ -153,7 +153,6 @@ def prepare_kontext(
 
         # Kontext is trained on specific resolutions, using one of them is recommended
         _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS)
-
         width = 2 * int(width / 16)
         height = 2 * int(height / 16)
 
diff --git a/models/qwen/qwen_handler.py b/models/qwen/qwen_handler.py
index c6004e1..6fc488a 100644
--- a/models/qwen/qwen_handler.py
+++ b/models/qwen/qwen_handler.py
@@ -15,6 +15,7 @@ class family_handler():
                             ("Default", "default"),
                             ("Lightning", "lightning")],
             "guidance_max_phases" : 1,
+            "lock_image_refs_ratios": True,
         }
 
 
diff --git a/models/wan/wan_handler.py b/models/wan/wan_handler.py
index 6d91fe2..9adc3a8 100644
--- a/models/wan/wan_handler.py
+++ b/models/wan/wan_handler.py
@@ -117,6 +117,8 @@ class family_handler():
             extra_model_def["no_background_removal"] = True
             # extra_model_def["at_least_one_image_ref_needed"] = True
 
+        if base_model_type in ["standin"] or vace_class: 
+            extra_model_def["lock_image_refs_ratios"] = True
 
         # if base_model_type in ["phantom_1.3B", "phantom_14B"]: 
         #     extra_model_def["one_image_ref_needed"] = True
diff --git a/shared/utils/utils.py b/shared/utils/utils.py
index a55807a..7ddf1eb 100644
--- a/shared/utils/utils.py
+++ b/shared/utils/utils.py
@@ -18,11 +18,11 @@ import os
 import tempfile
 import subprocess
 import json
-
+from functools import lru_cache
 
 
 from PIL import Image
-
+video_info_cache = []
 def seed_everything(seed: int):
     random.seed(seed)
     np.random.seed(seed)
@@ -77,7 +77,9 @@ def truncate_for_filesystem(s, max_bytes=255):
         else: r = m - 1
     return s[:l]
 
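+# memoize metadata for recently probed videos so repeated lookups of the same path skip reopening it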
+@lru_cache(maxsize=100)
 def get_video_info(video_path):
+    global video_info_cache
     import cv2
     cap = cv2.VideoCapture(video_path)
     
diff --git a/wgp.py b/wgp.py
index f11afa9..a3cc510 100644
--- a/wgp.py
+++ b/wgp.py
@@ -60,8 +60,8 @@ AUTOSAVE_FILENAME = "queue.zip"
 PROMPT_VARS_MAX = 10
 
 target_mmgp_version = "3.5.12"
-WanGP_version = "8.32"
-settings_version = 2.28
+WanGP_version = "8.33"
+settings_version = 2.29
 max_source_video_frames = 3000
 prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
 
@@ -3313,6 +3313,7 @@ def select_video(state, input_file_list, event_data: gr.EventData):
                 if not all_letters(src, pos): return False
                 if neg is not None and any_letters(src, neg): return False
                 return True
+            image_outputs = configs.get("image_mode",0) == 1
             map_video_prompt  = {"V" : "Control Video", ("VA", "U") : "Mask Video", "I" : "Reference Images"}
             map_image_prompt  = {"V" : "Source Video", "L" : "Last Video", "S" : "Start Image", "E" : "End Image"}
             map_audio_prompt  = {"A" : "Audio Source", "B" : "Audio Source #2"}
@@ -3364,6 +3365,7 @@ def select_video(state, input_file_list, event_data: gr.EventData):
                     if multiple_submodels:
                         video_guidance_scale += f" + Model Switch at {video_switch_threshold if video_model_switch_phase ==1 else video_switch_threshold2}"
             video_flow_shift = configs.get("flow_shift", None)
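+            # flow shift is not displayed in the info panel for image outputs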
+            if image_outputs: video_flow_shift = None 
             video_video_guide_outpainting = configs.get("video_guide_outpainting", "")
             video_outpainting = ""
             if len(video_video_guide_outpainting) > 0  and not video_video_guide_outpainting.startswith("#") \
@@ -4545,7 +4547,8 @@ def generate_video(
                 send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
             os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
             from shared.utils.utils import resize_and_remove_background
-            image_refs[nb_frames_positions:]  = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= not (any_background_ref or vace or standin) ) # no fit for vace ref images as it is done later
+            # keep image ratios if there is a background image ref (we will let the model preprocessor decide what to do) but remove bg if requested
+            image_refs[nb_frames_positions:]  = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= not (any_background_ref or model_def.get("lock_image_refs_ratios", False)) ) # no fit for vace ref images as it is done later
             update_task_thumbnails(task, locals())
             send_cmd("output")
     joint_pass = boost ==1 #and profile != 1 and profile != 3  
@@ -5912,6 +5915,8 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None
     if target == "settings":
         return inputs
 
+    image_outputs = inputs.get("image_mode",0) == 1
+
     pop=[]    
     if "force_fps" in inputs and len(inputs["force_fps"])== 0:
         pop += ["force_fps"]
@@ -5977,7 +5982,7 @@ def prepare_inputs_dict(target, inputs, model_type = None, model_filename = None
     if guidance_max_phases < 3 or guidance_phases < 3:
         pop += ["guidance3_scale", "switch_threshold2", "model_switch_phase"]
 
-    if ltxv:
+    if ltxv or image_outputs:
         pop += ["flow_shift"]
 
     if model_def.get("no_negative_prompt", False) :
@@ -6876,11 +6881,15 @@ def detect_auto_save_form(state, evt:gr.SelectData):
         return gr.update()
 
 def compute_video_length_label(fps, current_video_length):
-    return f"Number of frames ({fps} frames = 1s), current duration: {(current_video_length / fps):.1f}s",  
+    if fps is None:
+        return "Number of frames"
+    else:
+        return f"Number of frames ({fps} frames = 1s), current duration: {(current_video_length / fps):.1f}s"
 
-def refresh_video_length_label(state, current_video_length):
-    fps = get_model_fps(get_base_model_type(state["model_type"]))
-    return gr.update(label= compute_video_length_label(fps, current_video_length))
+def refresh_video_length_label(state, current_video_length, force_fps, video_guide, video_source):
+    base_model_type = get_base_model_type(state["model_type"])
+    computed_fps = get_computed_fps(force_fps, base_model_type , video_guide, video_source )
+    return gr.update(label= compute_video_length_label(computed_fps, current_video_length))
 
 def generate_video_tab(update_form = False, state_dict = None, ui_defaults = None, model_family = None, model_choice = None, header = None, main = None, main_tabs= None):
     global inputs_names #, advanced
@@ -7469,8 +7478,9 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                     
                     current_video_length = ui_defaults.get("video_length", 81 if get_model_family(base_model_type)=="wan" else 97)
 
+                    computed_fps = get_computed_fps(ui_defaults.get("force_fps",""), base_model_type , video_guide, video_source )
                     video_length = gr.Slider(min_frames, get_max_frames(737 if test_any_sliding_window(base_model_type) else 337), value=current_video_length, 
-                         step=frames_step, label=compute_video_length_label(fps, current_video_length) , visible = True, interactive= True)
+                         step=frames_step, label=compute_video_length_label(computed_fps, current_video_length) , visible = True, interactive= True)
 
             with gr.Row(visible = not lock_inference_steps) as inference_steps_row:                                       
                 num_inference_steps = gr.Slider(1, 100, value=ui_defaults.get("num_inference_steps",30), step=1, label="Number of Inference Steps", visible = True)
@@ -7643,7 +7653,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                             
 
                     with gr.Column(visible = (t2v or vace) and not fantasy) as audio_prompt_type_remux_row:
-                        gr.Markdown("You may transfer the exising audio tracks of a Control Video")
+                        gr.Markdown("You may transfer the existing audio tracks of a Control Video")
                         audio_prompt_type_remux = gr.Dropdown(
                             choices=[
                                 ("No Remux", ""),
@@ -7955,7 +7965,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
         extra_inputs = prompt_vars + [wizard_prompt, wizard_variables_var, wizard_prompt_activated_var, video_prompt_column, image_prompt_column,
                                       prompt_column_advanced, prompt_column_wizard_vars, prompt_column_wizard, lset_name, save_lset_prompt_drop, advanced_row, speed_tab, audio_tab, mmaudio_col, quality_tab,
                                       sliding_window_tab, misc_tab, prompt_enhancer_row, inference_steps_row, skip_layer_guidance_row, audio_guide_row, RIFLEx_setting_col,
-                                      video_prompt_type_video_guide, video_prompt_type_video_guide_alt, video_prompt_type_video_mask, video_prompt_type_image_refs, apg_col, audio_prompt_type_sources, audio_prompt_type_remux_row,
+                                      video_prompt_type_video_guide, video_prompt_type_video_guide_alt, video_prompt_type_video_mask, video_prompt_type_image_refs, apg_col, audio_prompt_type_sources,  audio_prompt_type_remux, audio_prompt_type_remux_row,
                                       video_guide_outpainting_col,video_guide_outpainting_top, video_guide_outpainting_bottom, video_guide_outpainting_left, video_guide_outpainting_right,
                                       video_guide_outpainting_checkbox, video_guide_outpainting_row, show_advanced, video_info_to_control_video_btn, video_info_to_video_source_btn, sample_solver_row,
                                       video_buttons_row, image_buttons_row, video_postprocessing_tab, audio_remuxing_tab, PP_MMAudio_row, PP_custom_audio_row, 
@@ -7975,7 +7985,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
             resolution.change(fn=record_last_resolution, inputs=[state, resolution])
 
 
-            video_length.release(fn=refresh_video_length_label, inputs=[state, video_length ], outputs = video_length, trigger_mode="always_last" )
+            # video_length.release(fn=refresh_video_length_label, inputs=[state, video_length ], outputs = video_length, trigger_mode="always_last" )
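+            # refresh the frame count label whenever something that affects the effective fps changes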
+            gr.on(triggers=[video_length.release, force_fps.change, video_guide.change, video_source.change], fn=refresh_video_length_label, inputs=[state, video_length, force_fps, video_guide, video_source] , outputs = video_length, trigger_mode="always_last" )
             guidance_phases.change(fn=change_guidance_phases, inputs= [state, guidance_phases], outputs =[model_switch_phase, guidance_phases_row, switch_threshold, switch_threshold2, guidance2_scale, guidance3_scale ])
             audio_prompt_type_remux.change(fn=refresh_audio_prompt_type_remux, inputs=[state, audio_prompt_type, audio_prompt_type_remux], outputs=[audio_prompt_type])
             audio_prompt_type_sources.change(fn=refresh_audio_prompt_type_sources, inputs=[state, audio_prompt_type, audio_prompt_type_sources], outputs=[audio_prompt_type, audio_guide, audio_guide2, speakers_locations_row])