lucky day

This commit is contained in:
deepbeepmeep 2025-08-12 00:52:22 +02:00
parent 58c1549962
commit 175e05fc1e
7 changed files with 564 additions and 356 deletions

View File

@ -20,6 +20,20 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models
**Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
## 🔥 Latest Updates :
### August 11 2025: WanGP v7.77 - Lucky Day
This is your lucky day! Thanks to new configuration options that let you store generated videos and images in lossless compressed formats, you will find that they look twice as good without doing anything!
Just kidding, they will only be marginally better, but at least this opens the way to professional editing.
Supported formats:
- Video: x264, x264 lossless, x265
- Images: jpeg, png, webp, webp lossless
Generation settings are embedded in every one of these formats (that was the hard part).
Also, you can now choose different output directories for images and videos.
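For example, here is a minimal sketch of reading the embedded settings back with the new helper in `shared/utils/audio_video.py` (the file path is illustrative):

```python
from shared.utils.audio_video import read_image_metadata

# Returns the embedded generation settings as a dict, or None if no metadata is found.
settings = read_image_metadata("outputs/my_image.webp")
if settings and "WanGP" in settings.get("type", ""):
    print(settings)  # the full generation settings
```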
### August 10 2025: WanGP v7.76 - Faster than the VAE ...
We have a funny one here today: FastWan 2.2 5B, the fastest video generator, needs only 20s to generate 121 frames at 720p. The snag is that the VAE is twice as slow...
Thanks to Kijai for extracting the Lora that is used to build the corresponding finetune.

View File

@ -131,7 +131,7 @@ from pathlib import Path
import torch
def remux_with_audio(video_path: Path, output_path: Path, audio: torch.Tensor, sampling_rate: int):
from shared.utils.utils import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files
from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
temp_path = Path(f.name)

View File

@ -21,6 +21,7 @@ from segment_anything.modeling.image_encoder import window_partition, window_unp
from .utils.get_default_model import get_matanyone_model
from .matanyone.inference.inference_core import InferenceCore
from .matanyone_wrapper import matanyone
from shared.utils.audio_video import save_video, save_image
arg_device = "cuda"
arg_sam_model_type="vit_h"
@ -377,14 +378,14 @@ def show_mask(video_state, interactive_state, mask_dropdown):
return select_frame
def save_video(frames, output_path, fps):
# def save_video(frames, output_path, fps):
writer = imageio.get_writer( output_path, fps=fps, codec='libx264', quality=8)
for frame in frames:
writer.append_data(frame)
writer.close()
# writer = imageio.get_writer( output_path, fps=fps, codec='libx264', quality=8)
# for frame in frames:
# writer.append_data(frame)
# writer.close()
return output_path
# return output_path
def mask_to_xyxy_box(mask):
rows, cols = np.where(mask == 255)
@ -535,20 +536,20 @@ def video_matting(video_state,video_input, end_slider, matting_type, interactive
file_name= video_state["video_name"]
file_name = ".".join(file_name.split(".")[:-1])
from shared.utils.utils import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files
from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files
source_audio_tracks, audio_metadata = extract_audio_tracks(video_input)
output_fg_path = f"./mask_outputs/{file_name}_fg.mp4"
output_fg_temp_path = f"./mask_outputs/{file_name}_fg_tmp.mp4"
if len(source_audio_tracks) == 0:
foreground_output = save_video(foreground, output_path=output_fg_path , fps=fps)
foreground_output = save_video(foreground,output_fg_path , fps=fps, codec_type= video_output_codec)
else:
foreground_output_tmp = save_video(foreground, output_path=output_fg_temp_path , fps=fps)
foreground_output_tmp = save_video(foreground, output_fg_temp_path , fps=fps, codec_type= video_output_codec)
combine_video_with_audio_tracks(output_fg_temp_path, source_audio_tracks, output_fg_path, audio_metadata=audio_metadata)
cleanup_temp_audio_files(source_audio_tracks)
os.remove(foreground_output_tmp)
foreground_output = output_fg_path
alpha_output = save_video(alpha, output_path="./mask_outputs/{}_alpha.mp4".format(file_name), fps=fps)
alpha_output = save_video(alpha, "./mask_outputs/{}_alpha.mp4".format(file_name), fps=fps, codec_type= video_output_codec)
return foreground_output, alpha_output, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
@ -745,8 +746,12 @@ def teleport_to_video_tab(tab_state):
return gr.Tabs(selected="video_gen")
def display(tabs, tab_state, vace_video_input, vace_image_input, vace_video_mask, vace_image_mask, vace_image_refs):
def display(tabs, tab_state, server_config, vace_video_input, vace_image_input, vace_video_mask, vace_image_mask, vace_image_refs):
# my_tab.select(fn=load_unload_models, inputs=[], outputs=[])
global image_output_codec, video_output_codec
image_output_codec = server_config.get("image_output_codec", None)
video_output_codec = server_config.get("video_output_codec", None)
media_url = "https://github.com/pq-yang/MatAnyone/releases/download/media/"

View File

@ -46,6 +46,7 @@ soundfile
ffmpeg-python
pyannote.audio
pynvml
piexif
#huggingface_hub[hf_xet] #slow down everything !!!!
# num2words
# spacy

shared/utils/audio_video.py (new file, +421 lines)
View File

@ -0,0 +1,421 @@
import subprocess
import tempfile, os
import ffmpeg
import torchvision.transforms.functional as TF
import torch.nn.functional as F
import cv2
import imageio
import binascii
import torchvision
import torch
from PIL import Image
import os.path as osp
import json
def rand_name(length=8, suffix=''):
name = binascii.b2a_hex(os.urandom(length)).decode('utf-8')
if suffix:
if not suffix.startswith('.'):
suffix = '.' + suffix
name += suffix
return name
def extract_audio_tracks(source_video, verbose=False, query_only=False):
"""
Extract all audio tracks from a source video into temporary AAC files.
Returns:
Tuple:
- List of temp file paths for extracted audio tracks
- List of corresponding metadata dicts:
{'codec', 'sample_rate', 'channels', 'duration', 'language'}
where 'duration' is set to container duration (for consistency).
"""
probe = ffmpeg.probe(source_video)
audio_streams = [s for s in probe['streams'] if s['codec_type'] == 'audio']
container_duration = float(probe['format'].get('duration', 0.0))
if not audio_streams:
if query_only: return 0
if verbose: print(f"No audio track found in {source_video}")
return [], []
if query_only:
return len(audio_streams)
if verbose:
print(f"Found {len(audio_streams)} audio track(s), container duration = {container_duration:.3f}s")
file_paths = []
metadata = []
for i, stream in enumerate(audio_streams):
fd, temp_path = tempfile.mkstemp(suffix=f'_track{i}.aac', prefix='audio_')
os.close(fd)
file_paths.append(temp_path)
metadata.append({
'codec': stream.get('codec_name'),
'sample_rate': int(stream.get('sample_rate', 0)),
'channels': int(stream.get('channels', 0)),
'duration': container_duration,
'language': stream.get('tags', {}).get('language', None)
})
ffmpeg.input(source_video).output(
temp_path,
**{'map': f'0:a:{i}', 'acodec': 'aac', 'b:a': '128k'}
).overwrite_output().run(quiet=not verbose)
return file_paths, metadata
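# Usage sketch (the input path is illustrative):
#   n_tracks = extract_audio_tracks("input.mp4", query_only=True)   # count only
#   paths, meta = extract_audio_tracks("input.mp4", verbose=True)   # temp .aac files + metadata dicts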
def combine_and_concatenate_video_with_audio_tracks(
save_path_tmp, video_path,
source_audio_tracks, new_audio_tracks,
source_audio_duration, audio_sampling_rate,
new_audio_from_start=False,
source_audio_metadata=None,
audio_bitrate='128k',
audio_codec='aac',
verbose = False
):
inputs, filters, maps, idx = ['-i', video_path], [], ['-map', '0:v'], 1
metadata_args = []
sources = source_audio_tracks or []
news = new_audio_tracks or []
duplicate_source = len(sources) == 1 and len(news) > 1
N = len(news) if source_audio_duration == 0 else max(len(sources), len(news)) or 1
for i in range(N):
s = (sources[i] if i < len(sources)
else sources[0] if duplicate_source else None)
n = news[i] if len(news) == N else (news[0] if news else None)
if source_audio_duration == 0:
if n:
inputs += ['-i', n]
filters.append(f'[{idx}:a]apad=pad_dur=100[aout{i}]')
idx += 1
else:
filters.append(f'anullsrc=r={audio_sampling_rate}:cl=mono,apad=pad_dur=100[aout{i}]')
else:
if s:
inputs += ['-i', s]
meta = source_audio_metadata[i] if source_audio_metadata and i < len(source_audio_metadata) else {}
needs_filter = (
meta.get('codec') != audio_codec or
meta.get('sample_rate') != audio_sampling_rate or
meta.get('channels') != 1 or
meta.get('duration', 0) < source_audio_duration
)
if needs_filter:
filters.append(
f'[{idx}:a]aresample={audio_sampling_rate},aformat=channel_layouts=mono,'
f'apad=pad_dur={source_audio_duration},atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
else:
filters.append(
f'[{idx}:a]apad=pad_dur={source_audio_duration},atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
if lang := meta.get('language'):
metadata_args += ['-metadata:s:a:' + str(i), f'language={lang}']
idx += 1
else:
filters.append(
f'anullsrc=r={audio_sampling_rate}:cl=mono,atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
if n:
inputs += ['-i', n]
start = '0' if new_audio_from_start else source_audio_duration
filters.append(
f'[{idx}:a]aresample={audio_sampling_rate},aformat=channel_layouts=mono,'
f'atrim=start={start},asetpts=PTS-STARTPTS[n{i}]')
filters.append(f'[s{i}][n{i}]concat=n=2:v=0:a=1[aout{i}]')
idx += 1
else:
filters.append(f'[s{i}]apad=pad_dur=100[aout{i}]')
maps += ['-map', f'[aout{i}]']
cmd = ['ffmpeg', '-y', *inputs,
'-filter_complex', ';'.join(filters),  # join the per-track filter chains into a single filter graph
*maps, *metadata_args,
'-c:v', 'copy',
'-c:a', audio_codec,
'-b:a', audio_bitrate,
'-ar', str(audio_sampling_rate),
'-ac', '1',
'-shortest', save_path_tmp]
if verbose:
print(f"ffmpeg command: {cmd}")
try:
subprocess.run(cmd, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
raise Exception(f"FFmpeg error: {e.stderr}")
def combine_video_with_audio_tracks(target_video, audio_tracks, output_video,
audio_metadata=None, verbose=False):
if not audio_tracks:
if verbose: print("No audio tracks to combine."); return False
dur = float(next(s for s in ffmpeg.probe(target_video)['streams']
if s['codec_type'] == 'video')['duration'])
if verbose: print(f"Video duration: {dur:.3f}s")
cmd = ['ffmpeg', '-y', '-i', target_video]
for path in audio_tracks:
cmd += ['-i', path]
cmd += ['-map', '0:v']
for i in range(len(audio_tracks)):
cmd += ['-map', f'{i+1}:a']
for i, meta in enumerate(audio_metadata or []):
if (lang := meta.get('language')):
cmd += ['-metadata:s:a:' + str(i), f'language={lang}']
cmd += ['-c:v', 'copy', '-c:a', 'copy', '-t', str(dur), output_video]
result = subprocess.run(cmd, capture_output=not verbose, text=True)
if result.returncode != 0:
raise Exception(f"FFmpeg error:\n{result.stderr}")
if verbose:
print(f"Created {output_video} with {len(audio_tracks)} audio track(s)")
return True
def cleanup_temp_audio_files(audio_tracks, verbose=False):
"""
Clean up temporary audio files.
Args:
audio_tracks: List of audio file paths to delete
verbose: Enable verbose output (default: False)
Returns:
Number of files successfully deleted
"""
deleted_count = 0
for audio_path in audio_tracks:
try:
if os.path.exists(audio_path):
os.unlink(audio_path)
deleted_count += 1
if verbose:
print(f"Cleaned up {audio_path}")
except PermissionError:
print(f"Warning: Could not delete {audio_path} (file may be in use)")
except Exception as e:
print(f"Warning: Error deleting {audio_path}: {e}")
if verbose and deleted_count > 0:
print(f"Successfully deleted {deleted_count} temporary audio file(s)")
return deleted_count
def save_video(tensor,
save_file=None,
fps=30,
codec_type='libx264_8',
container='mp4',
nrow=8,
normalize=True,
value_range=(-1, 1),
retry=5):
"""Save tensor as video with configurable codec and container options."""
suffix = f'.{container}'
cache_file = osp.join('/tmp', rand_name(suffix=suffix)) if save_file is None else save_file
if not cache_file.endswith(suffix):
cache_file = osp.splitext(cache_file)[0] + suffix
# Configure codec parameters
codec_params = _get_codec_params(codec_type, container)
# Process and save
error = None
for _ in range(retry):
try:
if torch.is_tensor(tensor):
# Preprocess tensor
tensor = tensor.clamp(min(value_range), max(value_range))
tensor = torch.stack([
torchvision.utils.make_grid(u, nrow=nrow, normalize=normalize, value_range=value_range)
for u in tensor.unbind(2)
], dim=1).permute(1, 2, 3, 0)
tensor = (tensor * 255).type(torch.uint8).cpu()
arrays = tensor.numpy()
else:
arrays = tensor
# Write video (silence ffmpeg logs)
writer = imageio.get_writer(cache_file, fps=fps, ffmpeg_log_level='error', **codec_params)
for frame in arrays:
writer.append_data(frame)
writer.close()
return cache_file
except Exception as e:
error = e
print(f"error saving {save_file}: {e}")
def _get_codec_params(codec_type, container):
"""Get codec parameters based on codec type and container."""
if codec_type == 'libx264_8':
return {'codec': 'libx264', 'quality': 8, 'pixelformat': 'yuv420p'}
elif codec_type == 'libx264_10':
return {'codec': 'libx264', 'quality': 10, 'pixelformat': 'yuv420p'}
elif codec_type == 'libx265_28':
return {'codec': 'libx265', 'pixelformat': 'yuv420p', 'output_params': ['-crf', '28', '-x265-params', 'log-level=none','-hide_banner', '-nostats']}
elif codec_type == 'libx265_8':
return {'codec': 'libx265', 'pixelformat': 'yuv420p', 'output_params': ['-crf', '8', '-x265-params', 'log-level=none','-hide_banner', '-nostats']}
elif codec_type == 'libx264_lossless':
if container == 'mkv':
return {'codec': 'ffv1', 'pixelformat': 'rgb24'}
else: # mp4
return {'codec': 'libx264', 'output_params': ['-crf', '0'], 'pixelformat': 'yuv444p'}
else: # libx264
return {'codec': 'libx264', 'pixelformat': 'yuv420p'}
def save_image(tensor,
save_file,
nrow=8,
normalize=True,
value_range=(-1, 1),
quality='jpeg_95', # 'jpeg_95', 'jpeg_85', 'jpeg_70', 'jpeg_50', 'webp_95', 'webp_85', 'webp_70', 'webp_50', 'png', 'webp_lossless'
retry=5):
"""Save tensor as image with configurable format and quality."""
# Get format and quality settings
format_info = _get_format_info(quality)
# Rename file extension to match requested format
save_file = osp.splitext(save_file)[0] + format_info['ext']
# Save image
error = None
for _ in range(retry):
try:
tensor = tensor.clamp(min(value_range), max(value_range))
if format_info['use_pil']:
# Use PIL for WebP and advanced options
grid = torchvision.utils.make_grid(tensor, nrow=nrow, normalize=normalize, value_range=value_range)
# Convert to PIL Image
grid = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to('cpu', torch.uint8).numpy()
img = Image.fromarray(grid)
img.save(save_file, **format_info['params'])
else:
# Use torchvision for JPEG and PNG
torchvision.utils.save_image(
tensor, save_file, nrow=nrow, normalize=normalize,
value_range=value_range, **format_info['params']
)
break
except Exception as e:
error = e
continue
else:
print(f'save_image failed, error: {error}', flush=True)
return save_file
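# Usage sketch (illustrative): `img` is a (c, h, w) tensor in the (-1, 1) range, as in wgp.py.
#   save_image(img, "frame.jpg", quality='webp_lossless')
# The extension is rewritten to match the requested format (here, .jpg becomes .webp).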
def _get_format_info(quality):
"""Get format extension and parameters."""
formats = {
# JPEG with torchvision (works)
'jpeg_95': {'ext': '.jpg', 'params': {'quality': 95}, 'use_pil': False},
'jpeg_85': {'ext': '.jpg', 'params': {'quality': 85}, 'use_pil': False},
'jpeg_70': {'ext': '.jpg', 'params': {'quality': 70}, 'use_pil': False},
'jpeg_50': {'ext': '.jpg', 'params': {'quality': 50}, 'use_pil': False},
# PNG with torchvision
'png': {'ext': '.png', 'params': {}, 'use_pil': False},
# WebP with PIL (for quality control)
'webp_95': {'ext': '.webp', 'params': {'quality': 95}, 'use_pil': True},
'webp_85': {'ext': '.webp', 'params': {'quality': 85}, 'use_pil': True},
'webp_70': {'ext': '.webp', 'params': {'quality': 70}, 'use_pil': True},
'webp_50': {'ext': '.webp', 'params': {'quality': 50}, 'use_pil': True},
'webp_lossless': {'ext': '.webp', 'params': {'lossless': True}, 'use_pil': True},
}
return formats.get(quality, formats['jpeg_95'])
from PIL import Image, PngImagePlugin
def _enc_uc(s):
try: return b"ASCII\0\0\0" + s.encode("ascii")
except UnicodeEncodeError: return b"UNICODE\0" + s.encode("utf-16le")
def _dec_uc(b):
if not isinstance(b, (bytes, bytearray)):
try: b = bytes(b)
except Exception: return None
if b.startswith(b"ASCII\0\0\0"): return b[8:].decode("ascii", "ignore")
if b.startswith(b"UNICODE\0"): return b[8:].decode("utf-16le", "ignore")
return b.decode("utf-8", "ignore")
def save_image_metadata(image_path, metadata_dict, **save_kwargs):
try:
j = json.dumps(metadata_dict, ensure_ascii=False)
ext = os.path.splitext(image_path)[1].lower()
with Image.open(image_path) as im:
if ext == ".png":
pi = PngImagePlugin.PngInfo(); pi.add_text("comment", j)
im.save(image_path, pnginfo=pi, **save_kwargs); return True
if ext in (".jpg", ".jpeg"):
im.save(image_path, comment=j.encode("utf-8"), **save_kwargs); return True
if ext == ".webp":
import piexif
exif = {"0th":{}, "Exif":{piexif.ExifIFD.UserComment:_enc_uc(j)}, "GPS":{}, "1st":{}, "thumbnail":None}
im.save(image_path, format="WEBP", exif=piexif.dump(exif), **save_kwargs); return True
raise ValueError("Unsupported format")
except Exception as e:
print(f"Error saving metadata: {e}"); return False
def read_image_metadata(image_path):
try:
ext = os.path.splitext(image_path)[1].lower()
with Image.open(image_path) as im:
if ext == ".png":
val = (getattr(im, "text", {}) or {}).get("comment") or im.info.get("comment")
return json.loads(val) if val else None
if ext in (".jpg", ".jpeg"):
val = im.info.get("comment")
if isinstance(val, (bytes, bytearray)): val = val.decode("utf-8", "ignore")
if val:
try: return json.loads(val)
except Exception: pass
exif = getattr(im, "getexif", lambda: None)()
if exif:
uc = exif.get(37510) # UserComment
s = _dec_uc(uc) if uc else None
if s:
try: return json.loads(s)
except Exception: pass
return None
if ext == ".webp":
exif_bytes = im.info.get("exif")  # reuse the already-open image
if not exif_bytes: return None
import piexif
uc = piexif.load(exif_bytes).get("Exif", {}).get(piexif.ExifIFD.UserComment)
s = _dec_uc(uc) if uc else None
return json.loads(s) if s else None
return None
except Exception as e:
print(f"Error reading metadata: {e}"); return None

View File

@ -1,6 +1,5 @@
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import argparse
import binascii
import os
import os.path as osp
import torchvision.transforms.functional as TF
@ -10,7 +9,6 @@ import tempfile
import imageio
import torch
import decord
import torchvision
from PIL import Image
import numpy as np
from rembg import remove, new_session
@ -21,8 +19,6 @@ import tempfile
import subprocess
import json
__all__ = ['cache_video', 'cache_image', 'str2bool']
from PIL import Image
@ -218,84 +214,6 @@ def resize_and_remove_background(img_list, budget_width, budget_height, rm_backg
return output_list
def rand_name(length=8, suffix=''):
name = binascii.b2a_hex(os.urandom(length)).decode('utf-8')
if suffix:
if not suffix.startswith('.'):
suffix = '.' + suffix
name += suffix
return name
def cache_video(tensor,
save_file=None,
fps=30,
suffix='.mp4',
nrow=8,
normalize=True,
value_range=(-1, 1),
retry=5):
# cache file
cache_file = osp.join('/tmp', rand_name(
suffix=suffix)) if save_file is None else save_file
# save to cache
error = None
for _ in range(retry):
try:
# preprocess
tensor = tensor.clamp(min(value_range), max(value_range))
tensor = torch.stack([
torchvision.utils.make_grid(
u, nrow=nrow, normalize=normalize, value_range=value_range)
for u in tensor.unbind(2)
],
dim=1).permute(1, 2, 3, 0)
tensor = (tensor * 255).type(torch.uint8).cpu()
# write video
writer = imageio.get_writer(
cache_file, fps=fps, codec='libx264', quality=8)
for frame in tensor.numpy():
writer.append_data(frame)
writer.close()
return cache_file
except Exception as e:
error = e
continue
else:
print(f'cache_video failed, error: {error}', flush=True)
return None
def cache_image(tensor,
save_file,
nrow=8,
normalize=True,
value_range=(-1, 1),
retry=5):
# cache file
suffix = osp.splitext(save_file)[1]
if suffix.lower() not in [
'.jpg', '.jpeg', '.png', '.tiff', '.gif', '.webp'
]:
suffix = '.png'
# save to cache
error = None
for _ in range(retry):
try:
tensor = tensor.clamp(min(value_range), max(value_range))
torchvision.utils.save_image(
tensor,
save_file,
nrow=nrow,
normalize=normalize,
value_range=value_range)
return save_file
except Exception as e:
error = e
continue
def str2bool(v):
@ -435,212 +353,3 @@ def create_progress_hook(filename):
return hook
import tempfile, os
import ffmpeg
def extract_audio_tracks(source_video, verbose=False, query_only=False):
"""
Extract all audio tracks from a source video into temporary AAC files.
Returns:
Tuple:
- List of temp file paths for extracted audio tracks
- List of corresponding metadata dicts:
{'codec', 'sample_rate', 'channels', 'duration', 'language'}
where 'duration' is set to container duration (for consistency).
"""
probe = ffmpeg.probe(source_video)
audio_streams = [s for s in probe['streams'] if s['codec_type'] == 'audio']
container_duration = float(probe['format'].get('duration', 0.0))
if not audio_streams:
if query_only: return 0
if verbose: print(f"No audio track found in {source_video}")
return [], []
if query_only:
return len(audio_streams)
if verbose:
print(f"Found {len(audio_streams)} audio track(s), container duration = {container_duration:.3f}s")
file_paths = []
metadata = []
for i, stream in enumerate(audio_streams):
fd, temp_path = tempfile.mkstemp(suffix=f'_track{i}.aac', prefix='audio_')
os.close(fd)
file_paths.append(temp_path)
metadata.append({
'codec': stream.get('codec_name'),
'sample_rate': int(stream.get('sample_rate', 0)),
'channels': int(stream.get('channels', 0)),
'duration': container_duration,
'language': stream.get('tags', {}).get('language', None)
})
ffmpeg.input(source_video).output(
temp_path,
**{f'map': f'0:a:{i}', 'acodec': 'aac', 'b:a': '128k'}
).overwrite_output().run(quiet=not verbose)
return file_paths, metadata
import subprocess
import subprocess
def combine_and_concatenate_video_with_audio_tracks(
save_path_tmp, video_path,
source_audio_tracks, new_audio_tracks,
source_audio_duration, audio_sampling_rate,
new_audio_from_start=False,
source_audio_metadata=None,
audio_bitrate='128k',
audio_codec='aac',
verbose = False
):
inputs, filters, maps, idx = ['-i', video_path], [], ['-map', '0:v'], 1
metadata_args = []
sources = source_audio_tracks or []
news = new_audio_tracks or []
duplicate_source = len(sources) == 1 and len(news) > 1
N = len(news) if source_audio_duration == 0 else max(len(sources), len(news)) or 1
for i in range(N):
s = (sources[i] if i < len(sources)
else sources[0] if duplicate_source else None)
n = news[i] if len(news) == N else (news[0] if news else None)
if source_audio_duration == 0:
if n:
inputs += ['-i', n]
filters.append(f'[{idx}:a]apad=pad_dur=100[aout{i}]')
idx += 1
else:
filters.append(f'anullsrc=r={audio_sampling_rate}:cl=mono,apad=pad_dur=100[aout{i}]')
else:
if s:
inputs += ['-i', s]
meta = source_audio_metadata[i] if source_audio_metadata and i < len(source_audio_metadata) else {}
needs_filter = (
meta.get('codec') != audio_codec or
meta.get('sample_rate') != audio_sampling_rate or
meta.get('channels') != 1 or
meta.get('duration', 0) < source_audio_duration
)
if needs_filter:
filters.append(
f'[{idx}:a]aresample={audio_sampling_rate},aformat=channel_layouts=mono,'
f'apad=pad_dur={source_audio_duration},atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
else:
filters.append(
f'[{idx}:a]apad=pad_dur={source_audio_duration},atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
if lang := meta.get('language'):
metadata_args += ['-metadata:s:a:' + str(i), f'language={lang}']
idx += 1
else:
filters.append(
f'anullsrc=r={audio_sampling_rate}:cl=mono,atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
if n:
inputs += ['-i', n]
start = '0' if new_audio_from_start else source_audio_duration
filters.append(
f'[{idx}:a]aresample={audio_sampling_rate},aformat=channel_layouts=mono,'
f'atrim=start={start},asetpts=PTS-STARTPTS[n{i}]')
filters.append(f'[s{i}][n{i}]concat=n=2:v=0:a=1[aout{i}]')
idx += 1
else:
filters.append(f'[s{i}]apad=pad_dur=100[aout{i}]')
maps += ['-map', f'[aout{i}]']
cmd = ['ffmpeg', '-y', *inputs,
'-filter_complex', ';'.join(filters), # ✅ Only change made
*maps, *metadata_args,
'-c:v', 'copy',
'-c:a', audio_codec,
'-b:a', audio_bitrate,
'-ar', str(audio_sampling_rate),
'-ac', '1',
'-shortest', save_path_tmp]
if verbose:
print(f"ffmpeg command: {cmd}")
try:
subprocess.run(cmd, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
raise Exception(f"FFmpeg error: {e.stderr}")
import ffmpeg
import subprocess
import ffmpeg
def combine_video_with_audio_tracks(target_video, audio_tracks, output_video,
audio_metadata=None, verbose=False):
if not audio_tracks:
if verbose: print("No audio tracks to combine."); return False
dur = float(next(s for s in ffmpeg.probe(target_video)['streams']
if s['codec_type'] == 'video')['duration'])
if verbose: print(f"Video duration: {dur:.3f}s")
cmd = ['ffmpeg', '-y', '-i', target_video]
for path in audio_tracks:
cmd += ['-i', path]
cmd += ['-map', '0:v']
for i in range(len(audio_tracks)):
cmd += ['-map', f'{i+1}:a']
for i, meta in enumerate(audio_metadata or []):
if (lang := meta.get('language')):
cmd += ['-metadata:s:a:' + str(i), f'language={lang}']
cmd += ['-c:v', 'copy', '-c:a', 'copy', '-t', str(dur), output_video]
result = subprocess.run(cmd, capture_output=not verbose, text=True)
if result.returncode != 0:
raise Exception(f"FFmpeg error:\n{result.stderr}")
if verbose:
print(f"Created {output_video} with {len(audio_tracks)} audio track(s)")
return True
def cleanup_temp_audio_files(audio_tracks, verbose=False):
"""
Clean up temporary audio files.
Args:
audio_tracks: List of audio file paths to delete
verbose: Enable verbose output (default: False)
Returns:
Number of files successfully deleted
"""
deleted_count = 0
for audio_path in audio_tracks:
try:
if os.path.exists(audio_path):
os.unlink(audio_path)
deleted_count += 1
if verbose:
print(f"Cleaned up {audio_path}")
except PermissionError:
print(f"Warning: Could not delete {audio_path} (file may be in use)")
except Exception as e:
print(f"Warning: Error deleting {audio_path}: {e}")
if verbose and deleted_count > 0:
print(f"Successfully deleted {deleted_count} temporary audio file(s)")
return deleted_count

wgp.py (164 changed lines)
View File

@ -13,11 +13,13 @@ from datetime import datetime
import gradio as gr
import random
import json
import numpy as np
import importlib
from shared.utils import notification_sound
from shared.utils.loras_mutipliers import preparse_loras_multipliers, parse_loras_multipliers
from shared.utils.utils import cache_video, convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video
from shared.utils.utils import extract_audio_tracks, combine_video_with_audio_tracks, combine_and_concatenate_video_with_audio_tracks, cleanup_temp_audio_files, calculate_new_dimensions
from shared.utils.utils import convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video, calculate_new_dimensions
from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, combine_and_concatenate_video_with_audio_tracks, cleanup_temp_audio_files, save_video, save_image
from shared.utils.audio_video import save_image_metadata, read_image_metadata
from shared.match_archi import match_nvidia_architecture
from shared.attention import get_attention_modes, get_supported_attention_modes
from huggingface_hub import hf_hub_download, snapshot_download
@ -53,7 +55,7 @@ AUTOSAVE_FILENAME = "queue.zip"
PROMPT_VARS_MAX = 10
target_mmgp_version = "3.5.8"
WanGP_version = "7.76"
WanGP_version = "7.77"
settings_version = 2.23
max_source_video_frames = 3000
prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
@ -1712,7 +1714,8 @@ if not Path(server_config_filename).is_file():
"transformer_types": [],
"transformer_quantization": "int8",
"text_encoder_quantization" : "int8",
"save_path": "outputs", #os.path.join(os.getcwd(),
"save_path": "outputs",
"image_save_path": "outputs",
"compile" : "",
"metadata_type": "metadata",
"boost" : 1,
@ -2186,7 +2189,11 @@ if len(args.vae_config) > 0:
vae_config = int(args.vae_config)
reload_needed = False
save_path = server_config.get("save_path", os.path.join(os.getcwd(), "gradio_outputs"))
save_path = server_config.get("save_path", os.path.join(os.getcwd(), "outputs"))
image_save_path = server_config.get("image_save_path", os.path.join(os.getcwd(), "outputs"))
if not "video_output_codec" in server_config: server_config["video_output_codec"]= "libx264_8"
if not "image_output_codec" in server_config: server_config["image_output_codec"]= "jpeg_95"
preload_model_policy = server_config.get("preload_model_policy", [])
@ -2699,6 +2706,7 @@ def apply_changes( state,
VAE_precision_choice,
mixed_precision_choice,
save_path_choice,
image_save_path_choice,
attention_choice,
compile_choice,
profile_choice,
@ -2718,6 +2726,9 @@ def apply_changes( state,
notification_sound_volume_choice = 50,
max_frames_multiplier_choice = 1,
display_stats_choice = 0,
video_output_codec_choice = None,
image_output_codec_choice = None,
audio_output_codec_choice = None,
last_resolution_choice = None,
):
if args.lock_config:
@ -2730,6 +2741,7 @@ def apply_changes( state,
"transformer_types": transformer_types_choices,
"text_encoder_quantization" : text_encoder_quantization_choice,
"save_path" : save_path_choice,
"image_save_path" : image_save_path_choice,
"compile" : compile_choice,
"profile" : profile_choice,
"vae_config" : vae_config_choice,
@ -2751,6 +2763,9 @@ def apply_changes( state,
"notification_sound_volume" : notification_sound_volume_choice,
"max_frames_multiplier" : max_frames_multiplier_choice,
"display_stats" : display_stats_choice,
"video_output_codec" : video_output_codec_choice,
"image_output_codec" : image_output_codec_choice,
"audio_output_codec" : audio_output_codec_choice,
"last_model_type" : state["model_type"],
"last_model_per_family": state["last_model_per_family"],
"last_advanced_choice": state["advanced"],
@ -2784,6 +2799,7 @@ def apply_changes( state,
vae_config = server_config["vae_config"]
boost = server_config["boost"]
save_path = server_config["save_path"]
image_save_path = server_config["image_save_path"]
preload_model_policy = server_config["preload_model_policy"]
transformer_quantization = server_config["transformer_quantization"]
transformer_dtype_policy = server_config["transformer_dtype_policy"]
@ -2791,7 +2807,9 @@ def apply_changes( state,
transformer_types = server_config["transformer_types"]
model_filename = get_model_filename(transformer_type, transformer_quantization, transformer_dtype_policy)
state["model_filename"] = model_filename
if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas", "depth_anything_v2_variant", "notification_sound_enabled", "notification_sound_volume", "mmaudio_enabled", "max_frames_multiplier", "display_stats"] for change in changes ):
if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas", "depth_anything_v2_variant",
"notification_sound_enabled", "notification_sound_volume", "mmaudio_enabled", "max_frames_multiplier", "display_stats",
"video_output_codec", "image_output_codec", "audio_output_codec"] for change in changes ):
model_family = gr.Dropdown()
model_choice = gr.Dropdown()
else:
@ -2802,18 +2820,6 @@ def apply_changes( state,
mmaudio_enabled = server_config["mmaudio_enabled"] > 0
return "<DIV ALIGN=CENTER>The new configuration has been succesfully applied</DIV>", header, model_family, model_choice, gr.Row(visible= server_config["enhancer_enabled"] == 1), gr.Row(visible= mmaudio_enabled), gr.Column(visible= mmaudio_enabled)
from moviepy.editor import ImageSequenceClip
import numpy as np
def save_video(final_frames, output_path, fps=24):
assert final_frames.ndim == 4 and final_frames.shape[3] == 3, f"invalid shape: {final_frames} (need t h w c)"
if final_frames.dtype != np.uint8:
final_frames = (final_frames * 255).astype(np.uint8)
ImageSequenceClip(list(final_frames), fps=fps).write_videofile(output_path, verbose= False)
def get_gen_info(state):
cache = state.get("gen", None)
if cache == None:
@ -3754,7 +3760,7 @@ def edit_video(
any_change = False
if sample != None:
video_path =get_available_filename(save_path, video_source, "_tmp") if any_mmaudio or has_already_audio else get_available_filename(save_path, video_source, "_post")
cache_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1))
save_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None))
if any_mmaudio or has_already_audio: tmp_path = video_path
any_change = True
@ -3970,7 +3976,7 @@ def generate_video(
process_map_video_guide = { "P": "pose", "D" : "depth", "S": "scribble", "E": "canny", "L": "flow", "C": "gray", "M": "inpaint", "U": "identity"}
processes_names = { "pose": "Open Pose", "depth": "Depth Mask", "scribble" : "Shapes", "flow" : "Flow Map", "gray" : "Gray Levels", "inpaint" : "Inpaint Mask", "identity": "Identity Mask", "raw" : "Raw Format", "canny" : "Canny Edges"}
global wan_model, offloadobj, reload_needed, save_path
global wan_model, offloadobj, reload_needed
gen = get_gen_info(state)
torch.set_grad_enabled(False)
if mode.startswith("edit_"):
@ -4238,6 +4244,7 @@ def generate_video(
torch.set_grad_enabled(False)
os.makedirs(save_path, exist_ok=True)
os.makedirs(image_save_path, exist_ok=True)
gc.collect()
torch.cuda.empty_cache()
wan_model._interrupt = False
@ -4708,18 +4715,19 @@ def generate_video(
video_path = os.path.join(save_path, file_name)
any_mmaudio = MMAudio_setting != 0 and server_config.get("mmaudio_enabled", 0) != 0 and sample.shape[1] >=fps
if is_image:
sample = sample.permute(1,2,3,0) #c f h w -> f h w c
new_video_path = []
if is_image:
image_path = os.path.join(image_save_path, file_name)
sample = sample.transpose(1,0) #c f h w -> f c h w
new_image_path = []
for no, img in enumerate(sample):
img = Image.fromarray((127.5 * (img + 1.0)).cpu().byte().numpy())
img_path = os.path.splitext(video_path)[0] + ("" if no==0 else f"_{no}") + ".jpg"
new_video_path.append(img_path)
img.save(img_path)
video_path= new_video_path
img_path = os.path.splitext(image_path)[0] + ("" if no==0 else f"_{no}") + ".jpg"
new_image_path.append(save_image(img, save_file = img_path, quality = server_config.get("image_output_codec", None)))
video_path= new_image_path
elif len(control_audio_tracks) > 0 or len(source_audio_tracks) > 0 or output_new_audio_filepath is not None or any_mmaudio or output_new_audio_data is not None or audio_source is not None:
video_path = os.path.join(save_path, file_name)
save_path_tmp = video_path[:-4] + "_tmp.mp4"
cache_video( tensor=sample[None], save_file=save_path_tmp, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1))
save_video( tensor=sample[None], save_file=save_path_tmp, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type = server_config.get("video_output_codec", None))
output_new_audio_temp_filepath = None
new_audio_from_start = reset_control_aligment
source_audio_duration = source_video_frames_count / fps
@ -4746,7 +4754,7 @@ def generate_video(
if output_new_audio_temp_filepath is not None: os.remove(output_new_audio_temp_filepath)
else:
cache_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1))
save_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None))
end_time = time.time()
@ -4756,6 +4764,11 @@ def generate_video(
inputs.pop("mode")
inputs["model_type"] = model_type
inputs["model_filename"] = original_filename
if is_image:
inputs["image_quality"] = server_config.get("image_output_codec", None)
else:
inputs["video_quality"] = server_config.get("video_output_codec", None)
modules = get_model_recursive_prop(model_type, "modules", return_list= True)
if len(modules) > 0 : inputs["modules"] = modules
if len(transformer_loras_filenames) > 0:
@ -4778,8 +4791,7 @@ def generate_video(
json.dump(configs, f, indent=4)
elif metadata_choice == "metadata":
if is_image:
with Image.open(path) as img:
img.save(path, comment=json.dumps(configs))
save_image_metadata(path, configs)
else:
from mutagen.mp4 import MP4
file = MP4(path)
@ -5690,7 +5702,7 @@ def has_video_file_extension(filename):
def has_image_file_extension(filename):
extension = os.path.splitext(filename)[-1]
return extension in [".jpeg", ".jpg", ".png", ".bmp", ".tiff"]
return extension in [".jpeg", ".jpg", ".png", ".webp", ".bmp", ".tiff"]
def add_videos_to_gallery(state, input_file_list, choice, files_to_load):
gen = get_gen_info(state)
@ -5795,7 +5807,7 @@ def use_video_settings(state, input_file_list, choice):
def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, switch_type_if_compatible):
configs = None
tags = None
any_image_or_video = False
if file_path.endswith(".json") and allow_json:
try:
with open(file_path, 'r', encoding='utf-8') as f:
@ -5807,22 +5819,22 @@ def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, sw
try:
file = MP4(file_path)
tags = file.tags['©cmt'][0]
configs = json.loads(tags)
any_image_or_video = True
except:
pass
elif has_image_file_extension(file_path):
try:
with Image.open(file_path) as img:
tags = img.info["comment"]
configs = read_image_metadata(file_path)
any_image_or_video = True
except:
pass
if tags is not None:
try:
configs = json.loads(tags)
if not "WanGP" in configs.get("type", ""): configs = None
except:
configs = None
if configs == None:
return None, False
if configs is None: return None, False
try:
if not "WanGP" in configs.get("type", ""): configs = None
except:
configs = None
current_model_filename = state["model_filename"]
current_model_type = state["model_type"]
@ -5848,7 +5860,7 @@ def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, sw
configs = defaults
configs["model_type"] = model_type
return configs, tags != None
return configs, any_image_or_video
def record_image_mode_tab(state, evt:gr.SelectData):
state["image_mode_tab"] = 0 if evt.index ==0 else 1
@ -7849,10 +7861,6 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
label="User Interface Theme. You will need to restart the App the see new Theme."
)
save_path_choice = gr.Textbox(
label="Output Folder for Generated Videos (need to restart app to be taken into account)",
value=server_config.get("save_path", save_path)
)
with gr.Tab("Performance"):
@ -7976,6 +7984,53 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
label="MMAudio (if enabled, 10 GB of extra models will be downloaded)"
)
with gr.Tab("Outputs"):
video_output_codec_choice = gr.Dropdown(
choices=[
("x265 Balanced Quality (CRF 28)", 'libx265_28'),
("x264 Balanced Quality (Level 8)", 'libx264_8'),
("x265 High Quality (CRF 8)", 'libx265_8'),
("x264 High Quality (Level 10)", 'libx264_10'),
("x264 Lossless", 'libx264_lossless'),
],
value=server_config.get("video_output_codec", "libx264_8"),
label="Video Codec to use"
)
image_output_codec_choice = gr.Dropdown(
choices=[
("JPEG Quality 85", 'jpeg_85'),
("WEBP Quality 85", 'webp_85'),
("JPEG Quality 95", 'jpeg_95'),
("WEBP Quality 95", 'webp_95'),
("WEBP Lossless", 'webp_lossless'),
("PNG Lossless", 'png'),
],
value=server_config.get("image_output_codec", "jpeg_95"),
label="Image Codec to use"
)
audio_output_codec_choice = gr.Dropdown(
choices=[
("AAC 128 kbit", 'aac_128'),
],
value=server_config.get("audio_output_codec", "aac_128"),
visible = False,
label="Audio Codec to use"
)
video_save_path_choice = gr.Textbox(
label="Output Folder for Generated Videos (need to restart app to be taken into account)",
value=server_config.get("save_path", save_path)
)
image_save_path_choice = gr.Textbox(
label="Output Folder for Generated Images (need to restart app to be taken into account)",
value=server_config.get("image_save_path", image_save_path)
)
with gr.Tab("Notifications"):
gr.Markdown("### Notification Settings")
notification_sound_enabled_choice = gr.Dropdown(
@ -8008,7 +8063,8 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
text_encoder_quantization_choice,
VAE_precision_choice,
mixed_precision_choice,
save_path_choice,
video_save_path_choice,
image_save_path_choice,
attention_choice,
compile_choice,
profile_choice,
@ -8028,6 +8084,9 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
notification_sound_volume_choice,
max_frames_multiplier_choice,
display_stats_choice,
video_output_codec_choice,
image_output_codec_choice,
audio_output_codec_choice,
resolution,
],
outputs= [msg , header, model_family, model_choice, prompt_enhancer_row, mmaudio_tab, PP_MMAudio_col]
@ -8626,7 +8685,7 @@ def create_ui():
with gr.Tab("Guides", id="info") as info_tab:
generate_info_tab()
with gr.Tab("Video Mask Creator", id="video_mask_creator") as video_mask_creator:
matanyone_app.display(main_tabs, tab_state, video_guide, image_guide, video_mask, image_mask, image_refs)
matanyone_app.display(main_tabs, tab_state, server_config, video_guide, image_guide, video_mask, image_mask, image_refs)
if not args.lock_config:
with gr.Tab("Downloads", id="downloads") as downloads_tab:
generate_download_tab(lset_name, loras_choices, state)
@ -8662,5 +8721,4 @@ if __name__ == "__main__":
else:
url = "http://" + server_name
webbrowser.open(url + ":" + str(server_port), new = 0, autoraise = True)
demo.launch(favicon_path="favicon.png", server_name=server_name, server_port=server_port, share=args.share, allowed_paths=[save_path])
demo.launch(favicon_path="favicon.png", server_name=server_name, server_port=server_port, share=args.share, allowed_paths=[save_path] + ([] if save_path == image_save_path else [image_save_path]))