Mirror of https://github.com/Wan-Video/Wan2.1.git (synced 2025-11-04 14:16:57 +00:00)

Commit 175e05fc1e ("lucky day"), parent 58c1549962

README.md: 14 changed lines
@@ -20,6 +20,20 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models

 **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep

 ## 🔥 Latest Updates :

+### August 11 2025: WanGP v7.77 - Lucky Day
+
+This is your lucky day! Thanks to new configuration options that let you store generated Videos and Images in lossless compressed formats, you will find that they in fact look twice as good without you doing anything!
+
+Just kidding, they will only be marginally better, but at least this opens the way to professional editing.
+
+Supported formats:
+- Video: x264, x264 lossless, x265
+- Images: jpeg, png, webp, webp lossless
+
+Generation Settings are stored in each of the above regardless of the format (that was the hard part).
+
+Also you can now choose different output directories for images and videos.
+
 ### August 10 2025: WanGP v7.76 - Faster than the VAE ...

 We have a funny one here today: FastWan 2.2 5B, the fastest Video Generator: only 20s to generate 121 frames at 720p. The snag is that the VAE is twice as slow...

 Thanks to Kijai for extracting the Lora that is used to build the corresponding finetune.
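The claim above that generation settings ride along in every output format can be checked directly. A minimal sketch, assuming default output folders (the file names here are hypothetical): images go through read_image_metadata, added in shared/utils/audio_video.py later in this diff, while videos carry the same JSON in the MP4 '©cmt' comment tag that wgp.py reads back with mutagen:

import json
from mutagen.mp4 import MP4
from shared.utils.audio_video import read_image_metadata

# Images: the JSON settings live in the PNG text chunk / JPEG comment / WebP EXIF UserComment.
settings = read_image_metadata("outputs/example.png")                  # hypothetical path

# Videos: the same JSON is stored in the MP4 comment tag.
video_settings = json.loads(MP4("outputs/example.mp4").tags['©cmt'][0])  # hypothetical path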
@@ -131,7 +131,7 @@ from pathlib import Path
 import torch

 def remux_with_audio(video_path: Path, output_path: Path, audio: torch.Tensor, sampling_rate: int):
-    from shared.utils.utils import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files
+    from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files

     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
         temp_path = Path(f.name)
@@ -21,6 +21,7 @@ from segment_anything.modeling.image_encoder import window_partition, window_unp
 from .utils.get_default_model import get_matanyone_model
 from .matanyone.inference.inference_core import InferenceCore
 from .matanyone_wrapper import matanyone
+from shared.utils.audio_video import save_video, save_image

 arg_device = "cuda"
 arg_sam_model_type="vit_h"
@@ -377,14 +378,14 @@ def show_mask(video_state, interactive_state, mask_dropdown):
     return select_frame


-def save_video(frames, output_path, fps):
+# def save_video(frames, output_path, fps):

-    writer = imageio.get_writer( output_path, fps=fps, codec='libx264', quality=8)
-    for frame in frames:
-        writer.append_data(frame)
-    writer.close()
+#     writer = imageio.get_writer( output_path, fps=fps, codec='libx264', quality=8)
+#     for frame in frames:
+#         writer.append_data(frame)
+#     writer.close()

-    return output_path
+#     return output_path

 def mask_to_xyxy_box(mask):
     rows, cols = np.where(mask == 255)
@@ -535,20 +536,20 @@ def video_matting(video_state,video_input, end_slider, matting_type, interactive
     file_name= video_state["video_name"]
     file_name = ".".join(file_name.split(".")[:-1])

-    from shared.utils.utils import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files
+    from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files
     source_audio_tracks, audio_metadata = extract_audio_tracks(video_input)
     output_fg_path = f"./mask_outputs/{file_name}_fg.mp4"
     output_fg_temp_path = f"./mask_outputs/{file_name}_fg_tmp.mp4"
     if len(source_audio_tracks) == 0:
-        foreground_output = save_video(foreground, output_path=output_fg_path , fps=fps)
+        foreground_output = save_video(foreground, output_fg_path , fps=fps, codec_type= video_output_codec)
     else:
-        foreground_output_tmp = save_video(foreground, output_path=output_fg_temp_path , fps=fps)
+        foreground_output_tmp = save_video(foreground, output_fg_temp_path , fps=fps, codec_type= video_output_codec)
         combine_video_with_audio_tracks(output_fg_temp_path, source_audio_tracks, output_fg_path, audio_metadata=audio_metadata)
         cleanup_temp_audio_files(source_audio_tracks)
         os.remove(foreground_output_tmp)
         foreground_output = output_fg_path

-    alpha_output = save_video(alpha, output_path="./mask_outputs/{}_alpha.mp4".format(file_name), fps=fps)
+    alpha_output = save_video(alpha, "./mask_outputs/{}_alpha.mp4".format(file_name), fps=fps, codec_type= video_output_codec)

     return foreground_output, alpha_output, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
@@ -745,8 +746,12 @@ def teleport_to_video_tab(tab_state):
     return gr.Tabs(selected="video_gen")


-def display(tabs, tab_state, vace_video_input, vace_image_input, vace_video_mask, vace_image_mask, vace_image_refs):
+def display(tabs, tab_state, server_config, vace_video_input, vace_image_input, vace_video_mask, vace_image_mask, vace_image_refs):
     # my_tab.select(fn=load_unload_models, inputs=[], outputs=[])
+    global image_output_codec, video_output_codec
+
+    image_output_codec = server_config.get("image_output_codec", None)
+    video_output_codec = server_config.get("video_output_codec", None)

     media_url = "https://github.com/pq-yang/MatAnyone/releases/download/media/"
@@ -46,6 +46,7 @@ soundfile
 ffmpeg-python
 pyannote.audio
 pynvml
+piexif
 #huggingface_hub[hf_xet] #slow down everything !!!!
 # num2words
 # spacy
shared/utils/audio_video.py: 421 lines (new file)

@@ -0,0 +1,421 @@
+import subprocess
+import tempfile, os
+import ffmpeg
+import torchvision.transforms.functional as TF
+import torch.nn.functional as F
+import cv2
+import tempfile
+import imageio
+import binascii
+import torchvision
+import torch
+from PIL import Image
+import os.path as osp
+import json
+
+def rand_name(length=8, suffix=''):
+    name = binascii.b2a_hex(os.urandom(length)).decode('utf-8')
+    if suffix:
+        if not suffix.startswith('.'):
+            suffix = '.' + suffix
+        name += suffix
+    return name
+
+
+def extract_audio_tracks(source_video, verbose=False, query_only=False):
+    """
+    Extract all audio tracks from a source video into temporary AAC files.
+
+    Returns:
+        Tuple:
+          - List of temp file paths for extracted audio tracks
+          - List of corresponding metadata dicts:
+            {'codec', 'sample_rate', 'channels', 'duration', 'language'}
+            where 'duration' is set to container duration (for consistency).
+    """
+    probe = ffmpeg.probe(source_video)
+    audio_streams = [s for s in probe['streams'] if s['codec_type'] == 'audio']
+    container_duration = float(probe['format'].get('duration', 0.0))
+
+    if not audio_streams:
+        if query_only: return 0
+        if verbose: print(f"No audio track found in {source_video}")
+        return [], []
+
+    if query_only:
+        return len(audio_streams)
+
+    if verbose:
+        print(f"Found {len(audio_streams)} audio track(s), container duration = {container_duration:.3f}s")
+
+    file_paths = []
+    metadata = []
+
+    for i, stream in enumerate(audio_streams):
+        fd, temp_path = tempfile.mkstemp(suffix=f'_track{i}.aac', prefix='audio_')
+        os.close(fd)
+
+        file_paths.append(temp_path)
+        metadata.append({
+            'codec': stream.get('codec_name'),
+            'sample_rate': int(stream.get('sample_rate', 0)),
+            'channels': int(stream.get('channels', 0)),
+            'duration': container_duration,
+            'language': stream.get('tags', {}).get('language', None)
+        })
+
+        ffmpeg.input(source_video).output(
+            temp_path,
+            **{f'map': f'0:a:{i}', 'acodec': 'aac', 'b:a': '128k'}
+        ).overwrite_output().run(quiet=not verbose)
+
+    return file_paths, metadata
+
+
+def combine_and_concatenate_video_with_audio_tracks(
+        save_path_tmp, video_path,
+        source_audio_tracks, new_audio_tracks,
+        source_audio_duration, audio_sampling_rate,
+        new_audio_from_start=False,
+        source_audio_metadata=None,
+        audio_bitrate='128k',
+        audio_codec='aac',
+        verbose = False
+        ):
+    inputs, filters, maps, idx = ['-i', video_path], [], ['-map', '0:v'], 1
+    metadata_args = []
+    sources = source_audio_tracks or []
+    news = new_audio_tracks or []
+
+    duplicate_source = len(sources) == 1 and len(news) > 1
+    N = len(news) if source_audio_duration == 0 else max(len(sources), len(news)) or 1
+
+    for i in range(N):
+        s = (sources[i] if i < len(sources)
+             else sources[0] if duplicate_source else None)
+        n = news[i] if len(news) == N else (news[0] if news else None)
+
+        if source_audio_duration == 0:
+            if n:
+                inputs += ['-i', n]
+                filters.append(f'[{idx}:a]apad=pad_dur=100[aout{i}]')
+                idx += 1
+            else:
+                filters.append(f'anullsrc=r={audio_sampling_rate}:cl=mono,apad=pad_dur=100[aout{i}]')
+        else:
+            if s:
+                inputs += ['-i', s]
+                meta = source_audio_metadata[i] if source_audio_metadata and i < len(source_audio_metadata) else {}
+                needs_filter = (
+                    meta.get('codec') != audio_codec or
+                    meta.get('sample_rate') != audio_sampling_rate or
+                    meta.get('channels') != 1 or
+                    meta.get('duration', 0) < source_audio_duration
+                )
+                if needs_filter:
+                    filters.append(
+                        f'[{idx}:a]aresample={audio_sampling_rate},aformat=channel_layouts=mono,'
+                        f'apad=pad_dur={source_audio_duration},atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
+                else:
+                    filters.append(
+                        f'[{idx}:a]apad=pad_dur={source_audio_duration},atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
+                if lang := meta.get('language'):
+                    metadata_args += ['-metadata:s:a:' + str(i), f'language={lang}']
+                idx += 1
+            else:
+                filters.append(
+                    f'anullsrc=r={audio_sampling_rate}:cl=mono,atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
+
+            if n:
+                inputs += ['-i', n]
+                start = '0' if new_audio_from_start else source_audio_duration
+                filters.append(
+                    f'[{idx}:a]aresample={audio_sampling_rate},aformat=channel_layouts=mono,'
+                    f'atrim=start={start},asetpts=PTS-STARTPTS[n{i}]')
+                filters.append(f'[s{i}][n{i}]concat=n=2:v=0:a=1[aout{i}]')
+                idx += 1
+            else:
+                filters.append(f'[s{i}]apad=pad_dur=100[aout{i}]')
+
+        maps += ['-map', f'[aout{i}]']
+
+    cmd = ['ffmpeg', '-y', *inputs,
+           '-filter_complex', ';'.join(filters),  # ✅ Only change made
+           *maps, *metadata_args,
+           '-c:v', 'copy',
+           '-c:a', audio_codec,
+           '-b:a', audio_bitrate,
+           '-ar', str(audio_sampling_rate),
+           '-ac', '1',
+           '-shortest', save_path_tmp]
+
+    if verbose:
+        print(f"ffmpeg command: {cmd}")
+    try:
+        subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        raise Exception(f"FFmpeg error: {e.stderr}")
+
+
+def combine_video_with_audio_tracks(target_video, audio_tracks, output_video,
+                                    audio_metadata=None, verbose=False):
+    if not audio_tracks:
+        if verbose: print("No audio tracks to combine."); return False
+
+    dur = float(next(s for s in ffmpeg.probe(target_video)['streams']
+                     if s['codec_type'] == 'video')['duration'])
+    if verbose: print(f"Video duration: {dur:.3f}s")
+
+    cmd = ['ffmpeg', '-y', '-i', target_video]
+    for path in audio_tracks:
+        cmd += ['-i', path]
+
+    cmd += ['-map', '0:v']
+    for i in range(len(audio_tracks)):
+        cmd += ['-map', f'{i+1}:a']
+
+    for i, meta in enumerate(audio_metadata or []):
+        if (lang := meta.get('language')):
+            cmd += ['-metadata:s:a:' + str(i), f'language={lang}']
+
+    cmd += ['-c:v', 'copy', '-c:a', 'copy', '-t', str(dur), output_video]
+
+    result = subprocess.run(cmd, capture_output=not verbose, text=True)
+    if result.returncode != 0:
+        raise Exception(f"FFmpeg error:\n{result.stderr}")
+    if verbose:
+        print(f"Created {output_video} with {len(audio_tracks)} audio track(s)")
+    return True
+
+
+def cleanup_temp_audio_files(audio_tracks, verbose=False):
+    """
+    Clean up temporary audio files.
+
+    Args:
+        audio_tracks: List of audio file paths to delete
+        verbose: Enable verbose output (default: False)
+
+    Returns:
+        Number of files successfully deleted
+    """
+    deleted_count = 0
+
+    for audio_path in audio_tracks:
+        try:
+            if os.path.exists(audio_path):
+                os.unlink(audio_path)
+                deleted_count += 1
+                if verbose:
+                    print(f"Cleaned up {audio_path}")
+        except PermissionError:
+            print(f"Warning: Could not delete {audio_path} (file may be in use)")
+        except Exception as e:
+            print(f"Warning: Error deleting {audio_path}: {e}")
+
+    if verbose and deleted_count > 0:
+        print(f"Successfully deleted {deleted_count} temporary audio file(s)")
+
+    return deleted_count
+
+
+def save_video(tensor,
+               save_file=None,
+               fps=30,
+               codec_type='libx264_8',
+               container='mp4',
+               nrow=8,
+               normalize=True,
+               value_range=(-1, 1),
+               retry=5):
+    """Save tensor as video with configurable codec and container options."""
+
+    suffix = f'.{container}'
+    cache_file = osp.join('/tmp', rand_name(suffix=suffix)) if save_file is None else save_file
+    if not cache_file.endswith(suffix):
+        cache_file = osp.splitext(cache_file)[0] + suffix
+
+    # Configure codec parameters
+    codec_params = _get_codec_params(codec_type, container)
+
+    # Process and save
+    error = None
+    for _ in range(retry):
+        try:
+            if torch.is_tensor(tensor):
+                # Preprocess tensor
+                tensor = tensor.clamp(min(value_range), max(value_range))
+                tensor = torch.stack([
+                    torchvision.utils.make_grid(u, nrow=nrow, normalize=normalize, value_range=value_range)
+                    for u in tensor.unbind(2)
+                ], dim=1).permute(1, 2, 3, 0)
+                tensor = (tensor * 255).type(torch.uint8).cpu()
+                arrays = tensor.numpy()
+            else:
+                arrays = tensor
+
+            # Write video (silence ffmpeg logs)
+            writer = imageio.get_writer(cache_file, fps=fps, ffmpeg_log_level='error', **codec_params)
+            for frame in arrays:
+                writer.append_data(frame)
+
+            writer.close()
+            return cache_file
+
+        except Exception as e:
+            error = e
+            print(f"error saving {save_file}: {e}")
+
+
+def _get_codec_params(codec_type, container):
+    """Get codec parameters based on codec type and container."""
+    if codec_type == 'libx264_8':
+        return {'codec': 'libx264', 'quality': 8, 'pixelformat': 'yuv420p'}
+    elif codec_type == 'libx264_10':
+        return {'codec': 'libx264', 'quality': 10, 'pixelformat': 'yuv420p'}
+    elif codec_type == 'libx265_28':
+        return {'codec': 'libx265', 'pixelformat': 'yuv420p', 'output_params': ['-crf', '28', '-x265-params', 'log-level=none', '-hide_banner', '-nostats']}
+    elif codec_type == 'libx265_8':
+        return {'codec': 'libx265', 'pixelformat': 'yuv420p', 'output_params': ['-crf', '8', '-x265-params', 'log-level=none', '-hide_banner', '-nostats']}
+    elif codec_type == 'libx264_lossless':
+        if container == 'mkv':
+            return {'codec': 'ffv1', 'pixelformat': 'rgb24'}
+        else:  # mp4
+            return {'codec': 'libx264', 'output_params': ['-crf', '0'], 'pixelformat': 'yuv444p'}
+    else:  # libx264
+        return {'codec': 'libx264', 'pixelformat': 'yuv420p'}
+
+
+def save_image(tensor,
+               save_file,
+               nrow=8,
+               normalize=True,
+               value_range=(-1, 1),
+               quality='jpeg_95',  # 'jpeg_95', 'jpeg_85', 'jpeg_70', 'jpeg_50', 'webp_95', 'webp_85', 'webp_70', 'webp_50', 'png', 'webp_lossless'
+               retry=5):
+    """Save tensor as image with configurable format and quality."""
+
+    # Get format and quality settings
+    format_info = _get_format_info(quality)
+
+    # Rename file extension to match requested format
+    save_file = osp.splitext(save_file)[0] + format_info['ext']
+
+    # Save image
+    error = None
+    for _ in range(retry):
+        try:
+            tensor = tensor.clamp(min(value_range), max(value_range))
+
+            if format_info['use_pil']:
+                # Use PIL for WebP and advanced options
+                grid = torchvision.utils.make_grid(tensor, nrow=nrow, normalize=normalize, value_range=value_range)
+                # Convert to PIL Image
+                grid = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to('cpu', torch.uint8).numpy()
+                img = Image.fromarray(grid)
+                img.save(save_file, **format_info['params'])
+            else:
+                # Use torchvision for JPEG and PNG
+                torchvision.utils.save_image(
+                    tensor, save_file, nrow=nrow, normalize=normalize,
+                    value_range=value_range, **format_info['params']
+                )
+            break
+        except Exception as e:
+            error = e
+            continue
+    else:
+        print(f'cache_image failed, error: {error}', flush=True)
+
+    return save_file
+
+
+def _get_format_info(quality):
+    """Get format extension and parameters."""
+    formats = {
+        # JPEG with torchvision (works)
+        'jpeg_95': {'ext': '.jpg', 'params': {'quality': 95}, 'use_pil': False},
+        'jpeg_85': {'ext': '.jpg', 'params': {'quality': 85}, 'use_pil': False},
+        'jpeg_70': {'ext': '.jpg', 'params': {'quality': 70}, 'use_pil': False},
+        'jpeg_50': {'ext': '.jpg', 'params': {'quality': 50}, 'use_pil': False},
+
+        # PNG with torchvision
+        'png': {'ext': '.png', 'params': {}, 'use_pil': False},
+
+        # WebP with PIL (for quality control)
+        'webp_95': {'ext': '.webp', 'params': {'quality': 95}, 'use_pil': True},
+        'webp_85': {'ext': '.webp', 'params': {'quality': 85}, 'use_pil': True},
+        'webp_70': {'ext': '.webp', 'params': {'quality': 70}, 'use_pil': True},
+        'webp_50': {'ext': '.webp', 'params': {'quality': 50}, 'use_pil': True},
+        'webp_lossless': {'ext': '.webp', 'params': {'lossless': True}, 'use_pil': True},
+    }
+    return formats.get(quality, formats['jpeg_95'])
+
+
+from PIL import Image, PngImagePlugin
+
+def _enc_uc(s):
+    try: return b"ASCII\0\0\0" + s.encode("ascii")
+    except UnicodeEncodeError: return b"UNICODE\0" + s.encode("utf-16le")
+
+def _dec_uc(b):
+    if not isinstance(b, (bytes, bytearray)):
+        try: b = bytes(b)
+        except Exception: return None
+    if b.startswith(b"ASCII\0\0\0"): return b[8:].decode("ascii", "ignore")
+    if b.startswith(b"UNICODE\0"): return b[8:].decode("utf-16le", "ignore")
+    return b.decode("utf-8", "ignore")
+
+def save_image_metadata(image_path, metadata_dict, **save_kwargs):
+    try:
+        j = json.dumps(metadata_dict, ensure_ascii=False)
+        ext = os.path.splitext(image_path)[1].lower()
+        with Image.open(image_path) as im:
+            if ext == ".png":
+                pi = PngImagePlugin.PngInfo(); pi.add_text("comment", j)
+                im.save(image_path, pnginfo=pi, **save_kwargs); return True
+            if ext in (".jpg", ".jpeg"):
+                im.save(image_path, comment=j.encode("utf-8"), **save_kwargs); return True
+            if ext == ".webp":
+                import piexif
+                exif = {"0th": {}, "Exif": {piexif.ExifIFD.UserComment: _enc_uc(j)}, "GPS": {}, "1st": {}, "thumbnail": None}
+                im.save(image_path, format="WEBP", exif=piexif.dump(exif), **save_kwargs); return True
+            raise ValueError("Unsupported format")
+    except Exception as e:
+        print(f"Error saving metadata: {e}"); return False
+
+def read_image_metadata(image_path):
+    try:
+        ext = os.path.splitext(image_path)[1].lower()
+        with Image.open(image_path) as im:
+            if ext == ".png":
+                val = (getattr(im, "text", {}) or {}).get("comment") or im.info.get("comment")
+                return json.loads(val) if val else None
+            if ext in (".jpg", ".jpeg"):
+                val = im.info.get("comment")
+                if isinstance(val, (bytes, bytearray)): val = val.decode("utf-8", "ignore")
+                if val:
+                    try: return json.loads(val)
+                    except Exception: pass
+                exif = getattr(im, "getexif", lambda: None)()
+                if exif:
+                    uc = exif.get(37510)  # UserComment
+                    s = _dec_uc(uc) if uc else None
+                    if s:
+                        try: return json.loads(s)
+                        except Exception: pass
+                return None
+            if ext == ".webp":
+                exif_bytes = Image.open(image_path).info.get("exif")
+                if not exif_bytes: return None
+                import piexif
+                uc = piexif.load(exif_bytes).get("Exif", {}).get(piexif.ExifIFD.UserComment)
+                s = _dec_uc(uc) if uc else None
+                return json.loads(s) if s else None
+        return None
+    except Exception as e:
+        print(f"Error reading metadata: {e}"); return None
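For orientation, a minimal usage sketch of the new module, assuming a source clip whose audio should survive a re-encode; the input and output paths and the placeholder frames are hypothetical. This mirrors the pattern the MatAnyone changes earlier in this diff use (extract, save with a chosen codec, recombine, clean up):

import numpy as np
from shared.utils.audio_video import (
    extract_audio_tracks, combine_video_with_audio_tracks,
    cleanup_temp_audio_files, save_video,
)

frames = np.zeros((48, 256, 256, 3), dtype=np.uint8)   # placeholder frames (f, h, w, c)

tracks, meta = extract_audio_tracks("input.mp4")       # hypothetical input file
tmp = save_video(frames, "out_tmp.mp4", fps=24, codec_type="libx264_lossless")
if tracks:
    combine_video_with_audio_tracks(tmp, tracks, "out.mp4", audio_metadata=meta)
    cleanup_temp_audio_files(tracks)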
shared/utils/utils.py

@@ -1,6 +1,5 @@
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 import argparse
-import binascii
 import os
 import os.path as osp
 import torchvision.transforms.functional as TF
@@ -10,7 +9,6 @@ import tempfile
 import imageio
 import torch
 import decord
 import torchvision
 from PIL import Image
 import numpy as np
 from rembg import remove, new_session
@@ -21,8 +19,6 @@ import tempfile
 import subprocess
 import json

 __all__ = ['cache_video', 'cache_image', 'str2bool']


 from PIL import Image
@@ -218,84 +214,6 @@ def resize_and_remove_background(img_list, budget_width, budget_height, rm_backg
     return output_list


-def rand_name(length=8, suffix=''):
-    name = binascii.b2a_hex(os.urandom(length)).decode('utf-8')
-    if suffix:
-        if not suffix.startswith('.'):
-            suffix = '.' + suffix
-        name += suffix
-    return name
-
-
-def cache_video(tensor,
-                save_file=None,
-                fps=30,
-                suffix='.mp4',
-                nrow=8,
-                normalize=True,
-                value_range=(-1, 1),
-                retry=5):
-    # cache file
-    cache_file = osp.join('/tmp', rand_name(
-        suffix=suffix)) if save_file is None else save_file
-
-    # save to cache
-    error = None
-    for _ in range(retry):
-        try:
-            # preprocess
-            tensor = tensor.clamp(min(value_range), max(value_range))
-            tensor = torch.stack([
-                torchvision.utils.make_grid(
-                    u, nrow=nrow, normalize=normalize, value_range=value_range)
-                for u in tensor.unbind(2)
-            ],
-                                 dim=1).permute(1, 2, 3, 0)
-            tensor = (tensor * 255).type(torch.uint8).cpu()
-
-            # write video
-            writer = imageio.get_writer(
-                cache_file, fps=fps, codec='libx264', quality=8)
-            for frame in tensor.numpy():
-                writer.append_data(frame)
-            writer.close()
-            return cache_file
-        except Exception as e:
-            error = e
-            continue
-    else:
-        print(f'cache_video failed, error: {error}', flush=True)
-        return None
-
-
-def cache_image(tensor,
-                save_file,
-                nrow=8,
-                normalize=True,
-                value_range=(-1, 1),
-                retry=5):
-    # cache file
-    suffix = osp.splitext(save_file)[1]
-    if suffix.lower() not in [
-            '.jpg', '.jpeg', '.png', '.tiff', '.gif', '.webp'
-    ]:
-        suffix = '.png'
-
-    # save to cache
-    error = None
-    for _ in range(retry):
-        try:
-            tensor = tensor.clamp(min(value_range), max(value_range))
-            torchvision.utils.save_image(
-                tensor,
-                save_file,
-                nrow=nrow,
-                normalize=normalize,
-                value_range=value_range)
-            return save_file
-        except Exception as e:
-            error = e
-            continue
-
-
 def str2bool(v):
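As a side note, the tensor preprocessing that cache_video performed above (and that save_video in shared/utils/audio_video.py reproduces) is easy to trace with toy shapes. A small sketch, assuming a (b, c, f, h, w) tensor in [-1, 1]:

import torch, torchvision

b, c, f, h, w = 1, 3, 2, 4, 4
video = (torch.rand(b, c, f, h, w) * 2 - 1).clamp(-1, 1)    # value_range=(-1, 1)
frames = torch.stack([
    torchvision.utils.make_grid(u, nrow=8, normalize=True, value_range=(-1, 1))
    for u in video.unbind(2)                  # one (b, c, h, w) slice per frame
], dim=1).permute(1, 2, 3, 0)                 # (c, f, H, W) -> (f, H, W, c)
frames = (frames * 255).type(torch.uint8)     # uint8 frames for the imageio writer
print(frames.shape)                           # torch.Size([2, 8, 8, 3]): 4px image plus 2px grid padding per side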
@@ -435,212 +353,3 @@ def create_progress_hook(filename):
     return hook

[removed: the old copies of extract_audio_tracks, combine_and_concatenate_video_with_audio_tracks, combine_video_with_audio_tracks and cleanup_temp_audio_files, together with their stray duplicated "import tempfile, os" / "import ffmpeg" / "import subprocess" lines; the code matches the versions added in shared/utils/audio_video.py above]
wgp.py: 154 changed lines
@@ -13,11 +13,13 @@ from datetime import datetime
 import gradio as gr
 import random
 import json
 import numpy as np
 import importlib
 from shared.utils import notification_sound
 from shared.utils.loras_mutipliers import preparse_loras_multipliers, parse_loras_multipliers
-from shared.utils.utils import cache_video, convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video
-from shared.utils.utils import extract_audio_tracks, combine_video_with_audio_tracks, combine_and_concatenate_video_with_audio_tracks, cleanup_temp_audio_files, calculate_new_dimensions
+from shared.utils.utils import convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video, calculate_new_dimensions
+from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, combine_and_concatenate_video_with_audio_tracks, cleanup_temp_audio_files, save_video, save_image
+from shared.utils.audio_video import save_image_metadata, read_image_metadata
 from shared.match_archi import match_nvidia_architecture
 from shared.attention import get_attention_modes, get_supported_attention_modes
 from huggingface_hub import hf_hub_download, snapshot_download
@@ -53,7 +55,7 @@ AUTOSAVE_FILENAME = "queue.zip"
 PROMPT_VARS_MAX = 10

 target_mmgp_version = "3.5.8"
-WanGP_version = "7.76"
+WanGP_version = "7.77"
 settings_version = 2.23
 max_source_video_frames = 3000
 prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
@@ -1712,7 +1714,8 @@ if not Path(server_config_filename).is_file():
         "transformer_types": [],
         "transformer_quantization": "int8",
         "text_encoder_quantization" : "int8",
-        "save_path": "outputs", #os.path.join(os.getcwd(),
+        "save_path": "outputs",
+        "image_save_path": "outputs",
         "compile" : "",
         "metadata_type": "metadata",
         "boost" : 1,
@@ -2186,7 +2189,11 @@ if len(args.vae_config) > 0:
     vae_config = int(args.vae_config)

 reload_needed = False
-save_path = server_config.get("save_path", os.path.join(os.getcwd(), "gradio_outputs"))
+save_path = server_config.get("save_path", os.path.join(os.getcwd(), "outputs"))
+image_save_path = server_config.get("image_save_path", os.path.join(os.getcwd(), "outputs"))
+if not "video_output_codec" in server_config: server_config["video_output_codec"]= "libx264_8"
+if not "image_output_codec" in server_config: server_config["image_output_codec"]= "jpeg_95"

 preload_model_policy = server_config.get("preload_model_policy", [])
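Taken together, the shims above mean an existing server config is upgraded in place. A hypothetical fragment of the resulting settings, with keys and defaults exactly as applied above:

server_config_fragment = {
    "save_path": "outputs",               # folder for generated videos
    "image_save_path": "outputs",         # folder for generated images, may now differ
    "video_output_codec": "libx264_8",    # see _get_codec_params in shared/utils/audio_video.py
    "image_output_codec": "jpeg_95",      # see _get_format_info in shared/utils/audio_video.py
}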
@@ -2699,6 +2706,7 @@ def apply_changes( state,
         VAE_precision_choice,
         mixed_precision_choice,
         save_path_choice,
+        image_save_path_choice,
         attention_choice,
         compile_choice,
         profile_choice,
@@ -2718,6 +2726,9 @@ def apply_changes( state,
         notification_sound_volume_choice = 50,
         max_frames_multiplier_choice = 1,
         display_stats_choice = 0,
+        video_output_codec_choice = None,
+        image_output_codec_choice = None,
+        audio_output_codec_choice = None,
         last_resolution_choice = None,
 ):
     if args.lock_config:
@@ -2730,6 +2741,7 @@ def apply_changes( state,
         "transformer_types": transformer_types_choices,
         "text_encoder_quantization" : text_encoder_quantization_choice,
         "save_path" : save_path_choice,
+        "image_save_path" : image_save_path_choice,
         "compile" : compile_choice,
         "profile" : profile_choice,
         "vae_config" : vae_config_choice,
@@ -2751,6 +2763,9 @@ def apply_changes( state,
         "notification_sound_volume" : notification_sound_volume_choice,
         "max_frames_multiplier" : max_frames_multiplier_choice,
         "display_stats" : display_stats_choice,
+        "video_output_codec" : video_output_codec_choice,
+        "image_output_codec" : image_output_codec_choice,
+        "audio_output_codec" : audio_output_codec_choice,
         "last_model_type" : state["model_type"],
         "last_model_per_family": state["last_model_per_family"],
         "last_advanced_choice": state["advanced"],
@@ -2784,6 +2799,7 @@ def apply_changes( state,
     vae_config = server_config["vae_config"]
     boost = server_config["boost"]
     save_path = server_config["save_path"]
+    image_save_path = server_config["image_save_path"]
     preload_model_policy = server_config["preload_model_policy"]
     transformer_quantization = server_config["transformer_quantization"]
     transformer_dtype_policy = server_config["transformer_dtype_policy"]
@@ -2791,7 +2807,9 @@ def apply_changes( state,
     transformer_types = server_config["transformer_types"]
     model_filename = get_model_filename(transformer_type, transformer_quantization, transformer_dtype_policy)
     state["model_filename"] = model_filename
-    if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas", "depth_anything_v2_variant", "notification_sound_enabled", "notification_sound_volume", "mmaudio_enabled", "max_frames_multiplier", "display_stats"] for change in changes ):
+    if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas", "depth_anything_v2_variant",
+                      "notification_sound_enabled", "notification_sound_volume", "mmaudio_enabled", "max_frames_multiplier", "display_stats",
+                      "video_output_codec", "image_output_codec", "audio_output_codec"] for change in changes ):
         model_family = gr.Dropdown()
         model_choice = gr.Dropdown()
     else:
@@ -2802,18 +2820,6 @@ def apply_changes( state,
     mmaudio_enabled = server_config["mmaudio_enabled"] > 0
     return "<DIV ALIGN=CENTER>The new configuration has been succesfully applied</DIV>", header, model_family, model_choice, gr.Row(visible= server_config["enhancer_enabled"] == 1), gr.Row(visible= mmaudio_enabled), gr.Column(visible= mmaudio_enabled)


-
-from moviepy.editor import ImageSequenceClip
-import numpy as np
-
-def save_video(final_frames, output_path, fps=24):
-    assert final_frames.ndim == 4 and final_frames.shape[3] == 3, f"invalid shape: {final_frames} (need t h w c)"
-    if final_frames.dtype != np.uint8:
-        final_frames = (final_frames * 255).astype(np.uint8)
-    ImageSequenceClip(list(final_frames), fps=fps).write_videofile(output_path, verbose= False)
-
-
 def get_gen_info(state):
     cache = state.get("gen", None)
     if cache == None:
@@ -3754,7 +3760,7 @@ def edit_video(
     any_change = False
     if sample != None:
         video_path =get_available_filename(save_path, video_source, "_tmp") if any_mmaudio or has_already_audio else get_available_filename(save_path, video_source, "_post")
-        cache_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1))
+        save_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None))

         if any_mmaudio or has_already_audio: tmp_path = video_path
         any_change = True
@@ -3970,7 +3976,7 @@ def generate_video(
     process_map_video_guide = { "P": "pose", "D" : "depth", "S": "scribble", "E": "canny", "L": "flow", "C": "gray", "M": "inpaint", "U": "identity"}
     processes_names = { "pose": "Open Pose", "depth": "Depth Mask", "scribble" : "Shapes", "flow" : "Flow Map", "gray" : "Gray Levels", "inpaint" : "Inpaint Mask", "identity": "Identity Mask", "raw" : "Raw Format", "canny" : "Canny Edges"}

-    global wan_model, offloadobj, reload_needed, save_path
+    global wan_model, offloadobj, reload_needed
     gen = get_gen_info(state)
     torch.set_grad_enabled(False)
     if mode.startswith("edit_"):
@@ -4238,6 +4244,7 @@ def generate_video(

     torch.set_grad_enabled(False)
     os.makedirs(save_path, exist_ok=True)
+    os.makedirs(image_save_path, exist_ok=True)
     gc.collect()
     torch.cuda.empty_cache()
     wan_model._interrupt = False
@@ -4709,17 +4716,18 @@ def generate_video(
             any_mmaudio = MMAudio_setting != 0 and server_config.get("mmaudio_enabled", 0) != 0 and sample.shape[1] >=fps

             if is_image:
-                sample = sample.permute(1,2,3,0) #c f h w -> f h w c
-                new_video_path = []
+                image_path = os.path.join(image_save_path, file_name)
+                sample = sample.transpose(1,0) #c f h w -> f c h w
+                new_image_path = []
                 for no, img in enumerate(sample):
                     img = Image.fromarray((127.5 * (img + 1.0)).cpu().byte().numpy())
-                    img_path = os.path.splitext(video_path)[0] + ("" if no==0 else f"_{no}") + ".jpg"
-                    new_video_path.append(img_path)
-                    img.save(img_path)
-                video_path= new_video_path
+                    img_path = os.path.splitext(image_path)[0] + ("" if no==0 else f"_{no}") + ".jpg"
+                    new_image_path.append(save_image(img, save_file = img_path, quality = server_config.get("image_output_codec", None)))
+
+                video_path= new_image_path
             elif len(control_audio_tracks) > 0 or len(source_audio_tracks) > 0 or output_new_audio_filepath is not None or any_mmaudio or output_new_audio_data is not None or audio_source is not None:
                 video_path = os.path.join(save_path, file_name)
                 save_path_tmp = video_path[:-4] + "_tmp.mp4"
-                cache_video( tensor=sample[None], save_file=save_path_tmp, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1))
+                save_video( tensor=sample[None], save_file=save_path_tmp, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type = server_config.get("video_output_codec", None))
                 output_new_audio_temp_filepath = None
                 new_audio_from_start = reset_control_aligment
                 source_audio_duration = source_video_frames_count / fps
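A small sketch of the new image path above in isolation, assuming a single (c, h, w) frame tensor in [-1, 1]; note that save_image rewrites the file extension to match the requested codec:

import torch
from shared.utils.audio_video import save_image

img = (torch.rand(3, 256, 256) * 2 - 1)                # placeholder frame tensor
path = save_image(img, save_file="outputs/frame.jpg",  # hypothetical path
                  quality="webp_85")
print(path)                                            # -> outputs/frame.webp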
@@ -4746,7 +4754,7 @@ def generate_video(
                 if output_new_audio_temp_filepath is not None: os.remove(output_new_audio_temp_filepath)

             else:
-                cache_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1))
+                save_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None))

             end_time = time.time()

@@ -4756,6 +4764,11 @@ def generate_video(
             inputs.pop("mode")
             inputs["model_type"] = model_type
             inputs["model_filename"] = original_filename
+            if is_image:
+                inputs["image_quality"] = server_config.get("image_output_codec", None)
+            else:
+                inputs["video_quality"] = server_config.get("video_output_codec", None)
+
             modules = get_model_recursive_prop(model_type, "modules", return_list= True)
             if len(modules) > 0 : inputs["modules"] = modules
             if len(transformer_loras_filenames) > 0:
@@ -4778,8 +4791,7 @@ def generate_video(
                     json.dump(configs, f, indent=4)
             elif metadata_choice == "metadata":
                 if is_image:
-                    with Image.open(path) as img:
-                        img.save(path, comment=json.dumps(configs))
+                    save_image_metadata(path, configs)
                 else:
                     from mutagen.mp4 import MP4
                     file = MP4(path)
@@ -5690,7 +5702,7 @@ def has_video_file_extension(filename):

 def has_image_file_extension(filename):
     extension = os.path.splitext(filename)[-1]
-    return extension in [".jpeg", ".jpg", ".png", ".bmp", ".tiff"]
+    return extension in [".jpeg", ".jpg", ".png", ".webp", ".bmp", ".tiff"]

 def add_videos_to_gallery(state, input_file_list, choice, files_to_load):
     gen = get_gen_info(state)
@@ -5795,7 +5807,7 @@ def use_video_settings(state, input_file_list, choice):

 def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, switch_type_if_compatible):
     configs = None
-    tags = None
+    any_image_or_video = False
     if file_path.endswith(".json") and allow_json:
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
@@ -5807,22 +5819,22 @@ def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, sw
         try:
             file = MP4(file_path)
             tags = file.tags['©cmt'][0]
+            configs = json.loads(tags)
+            any_image_or_video = True
         except:
             pass
     elif has_image_file_extension(file_path):
         try:
-            with Image.open(file_path) as img:
-                tags = img.info["comment"]
+            configs = read_image_metadata(file_path)
+            any_image_or_video = True
         except:
             pass
-    if tags is not None:
-        try:
-            configs = json.loads(tags)
-            if not "WanGP" in configs.get("type", ""): configs = None
-        except:
-            configs = None
-    if configs == None:
-        return None, False
+    if configs is None: return None, False

@@ -5848,7 +5860,7 @@ def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, sw
         configs = defaults
         configs["model_type"] = model_type

-    return configs, tags != None
+    return configs, any_image_or_video

 def record_image_mode_tab(state, evt:gr.SelectData):
     state["image_mode_tab"] = 0 if evt.index ==0 else 1
@@ -7849,10 +7861,6 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
                 label="User Interface Theme. You will need to restart the App the see new Theme."
             )

-            save_path_choice = gr.Textbox(
-                label="Output Folder for Generated Videos (need to restart app to be taken into account)",
-                value=server_config.get("save_path", save_path)
-            )

         with gr.Tab("Performance"):
@@ -7976,6 +7984,53 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
                 label="MMAudio (if enabled, 10 GB of extra models will be downloaded)"
             )

+        with gr.Tab("Outputs"):
+
+            video_output_codec_choice = gr.Dropdown(
+                choices=[
+                    ("x265 Balanced Quality (CRF 28)", 'libx265_28'),
+                    ("x264 Balanced Quality (Level 8)", 'libx264_8'),
+                    ("x265 High Quality (CRF 8)", 'libx265_8'),
+                    ("x264 High Quality (Level 10)", 'libx264_10'),
+                    ("x264 Lossless", 'libx264_lossless'),
+                ],
+                value=server_config.get("video_output_codec", "libx264_8"),
+                label="Video Codec to use"
+            )
+
+            image_output_codec_choice = gr.Dropdown(
+                choices=[
+                    ("JPEG Quality 85", 'jpeg_85'),
+                    ("WEBP Quality 85", 'webp_85'),
+                    ("JPEG Quality 95", 'jpeg_95'),
+                    ("WEBP Quality 95", 'webp_95'),
+                    ("WEBP Lossless", 'webp_lossless'),
+                    ("PNG Lossless", 'png'),
+                ],
+                value=server_config.get("image_output_codec", "jpeg_95"),
+                label="Image Codec to use"
+            )
+
+            audio_output_codec_choice = gr.Dropdown(
+                choices=[
+                    ("AAC 128 kbit", 'aac_128'),
+                ],
+                value=server_config.get("audio_output_codec", "aac_128"),
+                visible = False,
+                label="Audio Codec to use"
+            )
+
+            video_save_path_choice = gr.Textbox(
+                label="Output Folder for Generated Videos (need to restart app to be taken into account)",
+                value=server_config.get("save_path", save_path)
+            )
+
+            image_save_path_choice = gr.Textbox(
+                label="Output Folder for Generated Images (need to restart app to be taken into account)",
+                value=server_config.get("image_save_path", image_save_path)
+            )

         with gr.Tab("Notifications"):
             gr.Markdown("### Notification Settings")
             notification_sound_enabled_choice = gr.Dropdown(
@@ -8008,7 +8063,8 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
         text_encoder_quantization_choice,
         VAE_precision_choice,
         mixed_precision_choice,
-        save_path_choice,
+        video_save_path_choice,
+        image_save_path_choice,
         attention_choice,
         compile_choice,
         profile_choice,
@@ -8028,6 +8084,9 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
         notification_sound_volume_choice,
         max_frames_multiplier_choice,
         display_stats_choice,
+        video_output_codec_choice,
+        image_output_codec_choice,
+        audio_output_codec_choice,
         resolution,
     ],
     outputs= [msg , header, model_family, model_choice, prompt_enhancer_row, mmaudio_tab, PP_MMAudio_col]
@@ -8626,7 +8685,7 @@ def create_ui():
     with gr.Tab("Guides", id="info") as info_tab:
         generate_info_tab()
     with gr.Tab("Video Mask Creator", id="video_mask_creator") as video_mask_creator:
-        matanyone_app.display(main_tabs, tab_state, video_guide, image_guide, video_mask, image_mask, image_refs)
+        matanyone_app.display(main_tabs, tab_state, server_config, video_guide, image_guide, video_mask, image_mask, image_refs)
     if not args.lock_config:
         with gr.Tab("Downloads", id="downloads") as downloads_tab:
             generate_download_tab(lset_name, loras_choices, state)
@@ -8662,5 +8721,4 @@ if __name__ == "__main__":
     else:
         url = "http://" + server_name
         webbrowser.open(url + ":" + str(server_port), new = 0, autoraise = True)
-    demo.launch(favicon_path="favicon.png", server_name=server_name, server_port=server_port, share=args.share, allowed_paths=[save_path])
-
+    demo.launch(favicon_path="favicon.png", server_name=server_name, server_port=server_port, share=args.share, allowed_paths=[save_path] + [] if save_path == image_save_path else [image_save_path] )
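One reader's caution on the final added line, kept verbatim above: in Python, a conditional expression binds looser than +, so [save_path] + [] if cond else [image_save_path] parses as ([save_path] + []) if cond else [image_save_path], meaning that when the two folders differ only the image folder ends up in allowed_paths. A parenthesized sketch of what was presumably intended:

# Keep both folders allowed when they differ (assumed intent, not the committed code):
allowed = [save_path] + ([] if save_path == image_save_path else [image_save_path])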