Mirror of https://github.com/Wan-Video/Wan2.1.git (synced 2025-11-04 14:16:57 +00:00)

Commit 175e05fc1e ("lucky day"), parent 58c1549962

README.md: 14 changed lines
@@ -20,6 +20,20 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models

 **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep

 ## 🔥 Latest Updates :

+### August 11 2025: WanGP v7.77 - Lucky Day
+
+This is your lucky day! Thanks to new configuration options that let you store generated Videos and Images in lossless compressed formats, you will find that they in fact look twice as good without you doing anything!
+
+Just kidding, they will only be marginally better, but at least this opens the way to professional editing.
+
+Supported formats:
+- Video: x264, x264 lossless, x265
+- Images: jpeg, png, webp, webp lossless
+
+Generation Settings are stored in each of the above regardless of the format (that was the hard part).
+
+Also you can now choose different output directories for images and videos.
+
 ### August 10 2025: WanGP v7.76 - Faster than the VAE ...

 We have a funny one here today: FastWan 2.2 5B, the fastest Video Generator: only 20s to generate 121 frames at 720p. The snag is that the VAE is twice as slow...

 Thanks to Kijai for extracting the Lora that is used to build the corresponding finetune.
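The claim above that generation settings ride along in every output format can be checked directly. A minimal sketch, assuming default output folders (the file names here are hypothetical): images go through read_image_metadata, added in shared/utils/audio_video.py later in this diff, while videos carry the same JSON in the MP4 '©cmt' comment tag that wgp.py reads back with mutagen:

import json
from mutagen.mp4 import MP4
from shared.utils.audio_video import read_image_metadata

# Images: the JSON settings live in the PNG text chunk / JPEG comment / WebP EXIF UserComment.
settings = read_image_metadata("outputs/example.png")                  # hypothetical path

# Videos: the same JSON is stored in the MP4 comment tag.
video_settings = json.loads(MP4("outputs/example.mp4").tags['©cmt'][0])  # hypothetical path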
@@ -131,7 +131,7 @@ from pathlib import Path
 import torch

 def remux_with_audio(video_path: Path, output_path: Path, audio: torch.Tensor, sampling_rate: int):
-    from shared.utils.utils import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files
+    from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files

     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
         temp_path = Path(f.name)
@@ -21,6 +21,7 @@ from segment_anything.modeling.image_encoder import window_partition, window_unp
 from .utils.get_default_model import get_matanyone_model
 from .matanyone.inference.inference_core import InferenceCore
 from .matanyone_wrapper import matanyone
+from shared.utils.audio_video import save_video, save_image

 arg_device = "cuda"
 arg_sam_model_type="vit_h"
@@ -377,14 +378,14 @@ def show_mask(video_state, interactive_state, mask_dropdown):
     return select_frame


-def save_video(frames, output_path, fps):
+# def save_video(frames, output_path, fps):

-    writer = imageio.get_writer( output_path, fps=fps, codec='libx264', quality=8)
-    for frame in frames:
-        writer.append_data(frame)
-    writer.close()
+#     writer = imageio.get_writer( output_path, fps=fps, codec='libx264', quality=8)
+#     for frame in frames:
+#         writer.append_data(frame)
+#     writer.close()

-    return output_path
+#     return output_path

 def mask_to_xyxy_box(mask):
     rows, cols = np.where(mask == 255)
@@ -535,20 +536,20 @@ def video_matting(video_state,video_input, end_slider, matting_type, interactive
     file_name= video_state["video_name"]
     file_name = ".".join(file_name.split(".")[:-1])

-    from shared.utils.utils import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files
+    from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, cleanup_temp_audio_files
     source_audio_tracks, audio_metadata = extract_audio_tracks(video_input)
     output_fg_path = f"./mask_outputs/{file_name}_fg.mp4"
     output_fg_temp_path = f"./mask_outputs/{file_name}_fg_tmp.mp4"
     if len(source_audio_tracks) == 0:
-        foreground_output = save_video(foreground, output_path=output_fg_path , fps=fps)
+        foreground_output = save_video(foreground, output_fg_path , fps=fps, codec_type= video_output_codec)
     else:
-        foreground_output_tmp = save_video(foreground, output_path=output_fg_temp_path , fps=fps)
+        foreground_output_tmp = save_video(foreground, output_fg_temp_path , fps=fps, codec_type= video_output_codec)
         combine_video_with_audio_tracks(output_fg_temp_path, source_audio_tracks, output_fg_path, audio_metadata=audio_metadata)
         cleanup_temp_audio_files(source_audio_tracks)
         os.remove(foreground_output_tmp)
         foreground_output = output_fg_path

-    alpha_output = save_video(alpha, output_path="./mask_outputs/{}_alpha.mp4".format(file_name), fps=fps)
+    alpha_output = save_video(alpha, "./mask_outputs/{}_alpha.mp4".format(file_name), fps=fps, codec_type= video_output_codec)

     return foreground_output, alpha_output, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
@@ -745,8 +746,12 @@ def teleport_to_video_tab(tab_state):
     return gr.Tabs(selected="video_gen")


-def display(tabs, tab_state, vace_video_input, vace_image_input, vace_video_mask, vace_image_mask, vace_image_refs):
+def display(tabs, tab_state, server_config, vace_video_input, vace_image_input, vace_video_mask, vace_image_mask, vace_image_refs):
     # my_tab.select(fn=load_unload_models, inputs=[], outputs=[])
+    global image_output_codec, video_output_codec
+
+    image_output_codec = server_config.get("image_output_codec", None)
+    video_output_codec = server_config.get("video_output_codec", None)

     media_url = "https://github.com/pq-yang/MatAnyone/releases/download/media/"
@@ -46,6 +46,7 @@ soundfile
 ffmpeg-python
 pyannote.audio
 pynvml
+piexif
 #huggingface_hub[hf_xet] #slow down everything !!!!
 # num2words
 # spacy
shared/utils/audio_video.py: 421 lines (new file)

@@ -0,0 +1,421 @@
+import subprocess
+import tempfile, os
+import ffmpeg
+import torchvision.transforms.functional as TF
+import torch.nn.functional as F
+import cv2
+import tempfile
+import imageio
+import binascii
+import torchvision
+import torch
+from PIL import Image
+import os.path as osp
+import json
+
+def rand_name(length=8, suffix=''):
+    name = binascii.b2a_hex(os.urandom(length)).decode('utf-8')
+    if suffix:
+        if not suffix.startswith('.'):
+            suffix = '.' + suffix
+        name += suffix
+    return name
+
+
+def extract_audio_tracks(source_video, verbose=False, query_only=False):
+    """
+    Extract all audio tracks from a source video into temporary AAC files.
+
+    Returns:
+        Tuple:
+          - List of temp file paths for extracted audio tracks
+          - List of corresponding metadata dicts:
+            {'codec', 'sample_rate', 'channels', 'duration', 'language'}
+            where 'duration' is set to container duration (for consistency).
+    """
+    probe = ffmpeg.probe(source_video)
+    audio_streams = [s for s in probe['streams'] if s['codec_type'] == 'audio']
+    container_duration = float(probe['format'].get('duration', 0.0))
+
+    if not audio_streams:
+        if query_only: return 0
+        if verbose: print(f"No audio track found in {source_video}")
+        return [], []
+
+    if query_only:
+        return len(audio_streams)
+
+    if verbose:
+        print(f"Found {len(audio_streams)} audio track(s), container duration = {container_duration:.3f}s")
+
+    file_paths = []
+    metadata = []
+
+    for i, stream in enumerate(audio_streams):
+        fd, temp_path = tempfile.mkstemp(suffix=f'_track{i}.aac', prefix='audio_')
+        os.close(fd)
+
+        file_paths.append(temp_path)
+        metadata.append({
+            'codec': stream.get('codec_name'),
+            'sample_rate': int(stream.get('sample_rate', 0)),
+            'channels': int(stream.get('channels', 0)),
+            'duration': container_duration,
+            'language': stream.get('tags', {}).get('language', None)
+        })
+
+        ffmpeg.input(source_video).output(
+            temp_path,
+            **{f'map': f'0:a:{i}', 'acodec': 'aac', 'b:a': '128k'}
+        ).overwrite_output().run(quiet=not verbose)
+
+    return file_paths, metadata
+
+
+def combine_and_concatenate_video_with_audio_tracks(
+        save_path_tmp, video_path,
+        source_audio_tracks, new_audio_tracks,
+        source_audio_duration, audio_sampling_rate,
+        new_audio_from_start=False,
+        source_audio_metadata=None,
+        audio_bitrate='128k',
+        audio_codec='aac',
+        verbose = False
+        ):
+    inputs, filters, maps, idx = ['-i', video_path], [], ['-map', '0:v'], 1
+    metadata_args = []
+    sources = source_audio_tracks or []
+    news = new_audio_tracks or []
+
+    duplicate_source = len(sources) == 1 and len(news) > 1
+    N = len(news) if source_audio_duration == 0 else max(len(sources), len(news)) or 1
+
+    for i in range(N):
+        s = (sources[i] if i < len(sources)
+             else sources[0] if duplicate_source else None)
+        n = news[i] if len(news) == N else (news[0] if news else None)
+
+        if source_audio_duration == 0:
+            if n:
+                inputs += ['-i', n]
+                filters.append(f'[{idx}:a]apad=pad_dur=100[aout{i}]')
+                idx += 1
+            else:
+                filters.append(f'anullsrc=r={audio_sampling_rate}:cl=mono,apad=pad_dur=100[aout{i}]')
+        else:
+            if s:
+                inputs += ['-i', s]
+                meta = source_audio_metadata[i] if source_audio_metadata and i < len(source_audio_metadata) else {}
+                needs_filter = (
+                    meta.get('codec') != audio_codec or
+                    meta.get('sample_rate') != audio_sampling_rate or
+                    meta.get('channels') != 1 or
+                    meta.get('duration', 0) < source_audio_duration
+                )
+                if needs_filter:
+                    filters.append(
+                        f'[{idx}:a]aresample={audio_sampling_rate},aformat=channel_layouts=mono,'
+                        f'apad=pad_dur={source_audio_duration},atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
+                else:
+                    filters.append(
+                        f'[{idx}:a]apad=pad_dur={source_audio_duration},atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
+                if lang := meta.get('language'):
+                    metadata_args += ['-metadata:s:a:' + str(i), f'language={lang}']
+                idx += 1
+            else:
+                filters.append(
+                    f'anullsrc=r={audio_sampling_rate}:cl=mono,atrim=0:{source_audio_duration},asetpts=PTS-STARTPTS[s{i}]')
+
+            if n:
+                inputs += ['-i', n]
+                start = '0' if new_audio_from_start else source_audio_duration
+                filters.append(
+                    f'[{idx}:a]aresample={audio_sampling_rate},aformat=channel_layouts=mono,'
+                    f'atrim=start={start},asetpts=PTS-STARTPTS[n{i}]')
+                filters.append(f'[s{i}][n{i}]concat=n=2:v=0:a=1[aout{i}]')
+                idx += 1
+            else:
+                filters.append(f'[s{i}]apad=pad_dur=100[aout{i}]')
+
+        maps += ['-map', f'[aout{i}]']
+
+    cmd = ['ffmpeg', '-y', *inputs,
+           '-filter_complex', ';'.join(filters),  # ✅ Only change made
+           *maps, *metadata_args,
+           '-c:v', 'copy',
+           '-c:a', audio_codec,
+           '-b:a', audio_bitrate,
+           '-ar', str(audio_sampling_rate),
+           '-ac', '1',
+           '-shortest', save_path_tmp]
+
+    if verbose:
+        print(f"ffmpeg command: {cmd}")
+    try:
+        subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        raise Exception(f"FFmpeg error: {e.stderr}")
+
+
+def combine_video_with_audio_tracks(target_video, audio_tracks, output_video,
+                                    audio_metadata=None, verbose=False):
+    if not audio_tracks:
+        if verbose: print("No audio tracks to combine."); return False
+
+    dur = float(next(s for s in ffmpeg.probe(target_video)['streams']
+                     if s['codec_type'] == 'video')['duration'])
+    if verbose: print(f"Video duration: {dur:.3f}s")
+
+    cmd = ['ffmpeg', '-y', '-i', target_video]
+    for path in audio_tracks:
+        cmd += ['-i', path]
+
+    cmd += ['-map', '0:v']
+    for i in range(len(audio_tracks)):
+        cmd += ['-map', f'{i+1}:a']
+
+    for i, meta in enumerate(audio_metadata or []):
+        if (lang := meta.get('language')):
+            cmd += ['-metadata:s:a:' + str(i), f'language={lang}']
+
+    cmd += ['-c:v', 'copy', '-c:a', 'copy', '-t', str(dur), output_video]
+
+    result = subprocess.run(cmd, capture_output=not verbose, text=True)
+    if result.returncode != 0:
+        raise Exception(f"FFmpeg error:\n{result.stderr}")
+    if verbose:
+        print(f"Created {output_video} with {len(audio_tracks)} audio track(s)")
+    return True
+
+
+def cleanup_temp_audio_files(audio_tracks, verbose=False):
+    """
+    Clean up temporary audio files.
+
+    Args:
+        audio_tracks: List of audio file paths to delete
+        verbose: Enable verbose output (default: False)
+
+    Returns:
+        Number of files successfully deleted
+    """
+    deleted_count = 0
+
+    for audio_path in audio_tracks:
+        try:
+            if os.path.exists(audio_path):
+                os.unlink(audio_path)
+                deleted_count += 1
+                if verbose:
+                    print(f"Cleaned up {audio_path}")
+        except PermissionError:
+            print(f"Warning: Could not delete {audio_path} (file may be in use)")
+        except Exception as e:
+            print(f"Warning: Error deleting {audio_path}: {e}")
+
+    if verbose and deleted_count > 0:
+        print(f"Successfully deleted {deleted_count} temporary audio file(s)")
+
+    return deleted_count
+
+
+def save_video(tensor,
+               save_file=None,
+               fps=30,
+               codec_type='libx264_8',
+               container='mp4',
+               nrow=8,
+               normalize=True,
+               value_range=(-1, 1),
+               retry=5):
+    """Save tensor as video with configurable codec and container options."""
+
+    suffix = f'.{container}'
+    cache_file = osp.join('/tmp', rand_name(suffix=suffix)) if save_file is None else save_file
+    if not cache_file.endswith(suffix):
+        cache_file = osp.splitext(cache_file)[0] + suffix
+
+    # Configure codec parameters
+    codec_params = _get_codec_params(codec_type, container)
+
+    # Process and save
+    error = None
+    for _ in range(retry):
+        try:
+            if torch.is_tensor(tensor):
+                # Preprocess tensor
+                tensor = tensor.clamp(min(value_range), max(value_range))
+                tensor = torch.stack([
+                    torchvision.utils.make_grid(u, nrow=nrow, normalize=normalize, value_range=value_range)
+                    for u in tensor.unbind(2)
+                ], dim=1).permute(1, 2, 3, 0)
+                tensor = (tensor * 255).type(torch.uint8).cpu()
+                arrays = tensor.numpy()
+            else:
+                arrays = tensor
+
+            # Write video (silence ffmpeg logs)
+            writer = imageio.get_writer(cache_file, fps=fps, ffmpeg_log_level='error', **codec_params)
+            for frame in arrays:
+                writer.append_data(frame)
+
+            writer.close()
+            return cache_file
+
+        except Exception as e:
+            error = e
+            print(f"error saving {save_file}: {e}")
+
+
+def _get_codec_params(codec_type, container):
+    """Get codec parameters based on codec type and container."""
+    if codec_type == 'libx264_8':
+        return {'codec': 'libx264', 'quality': 8, 'pixelformat': 'yuv420p'}
+    elif codec_type == 'libx264_10':
+        return {'codec': 'libx264', 'quality': 10, 'pixelformat': 'yuv420p'}
+    elif codec_type == 'libx265_28':
+        return {'codec': 'libx265', 'pixelformat': 'yuv420p', 'output_params': ['-crf', '28', '-x265-params', 'log-level=none', '-hide_banner', '-nostats']}
+    elif codec_type == 'libx265_8':
+        return {'codec': 'libx265', 'pixelformat': 'yuv420p', 'output_params': ['-crf', '8', '-x265-params', 'log-level=none', '-hide_banner', '-nostats']}
+    elif codec_type == 'libx264_lossless':
+        if container == 'mkv':
+            return {'codec': 'ffv1', 'pixelformat': 'rgb24'}
+        else:  # mp4
+            return {'codec': 'libx264', 'output_params': ['-crf', '0'], 'pixelformat': 'yuv444p'}
+    else:  # libx264
+        return {'codec': 'libx264', 'pixelformat': 'yuv420p'}
+
+
+def save_image(tensor,
+               save_file,
+               nrow=8,
+               normalize=True,
+               value_range=(-1, 1),
+               quality='jpeg_95',  # 'jpeg_95', 'jpeg_85', 'jpeg_70', 'jpeg_50', 'webp_95', 'webp_85', 'webp_70', 'webp_50', 'png', 'webp_lossless'
+               retry=5):
+    """Save tensor as image with configurable format and quality."""
+
+    # Get format and quality settings
+    format_info = _get_format_info(quality)
+
+    # Rename file extension to match requested format
+    save_file = osp.splitext(save_file)[0] + format_info['ext']
+
+    # Save image
+    error = None
+    for _ in range(retry):
+        try:
+            tensor = tensor.clamp(min(value_range), max(value_range))
+
+            if format_info['use_pil']:
+                # Use PIL for WebP and advanced options
+                grid = torchvision.utils.make_grid(tensor, nrow=nrow, normalize=normalize, value_range=value_range)
+                # Convert to PIL Image
+                grid = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to('cpu', torch.uint8).numpy()
+                img = Image.fromarray(grid)
+                img.save(save_file, **format_info['params'])
+            else:
+                # Use torchvision for JPEG and PNG
+                torchvision.utils.save_image(
+                    tensor, save_file, nrow=nrow, normalize=normalize,
+                    value_range=value_range, **format_info['params']
+                )
+            break
+        except Exception as e:
+            error = e
+            continue
+    else:
+        print(f'cache_image failed, error: {error}', flush=True)
+
+    return save_file
+
+
+def _get_format_info(quality):
+    """Get format extension and parameters."""
+    formats = {
+        # JPEG with torchvision (works)
+        'jpeg_95': {'ext': '.jpg', 'params': {'quality': 95}, 'use_pil': False},
+        'jpeg_85': {'ext': '.jpg', 'params': {'quality': 85}, 'use_pil': False},
+        'jpeg_70': {'ext': '.jpg', 'params': {'quality': 70}, 'use_pil': False},
+        'jpeg_50': {'ext': '.jpg', 'params': {'quality': 50}, 'use_pil': False},
+
+        # PNG with torchvision
+        'png': {'ext': '.png', 'params': {}, 'use_pil': False},
+
+        # WebP with PIL (for quality control)
+        'webp_95': {'ext': '.webp', 'params': {'quality': 95}, 'use_pil': True},
+        'webp_85': {'ext': '.webp', 'params': {'quality': 85}, 'use_pil': True},
+        'webp_70': {'ext': '.webp', 'params': {'quality': 70}, 'use_pil': True},
+        'webp_50': {'ext': '.webp', 'params': {'quality': 50}, 'use_pil': True},
+        'webp_lossless': {'ext': '.webp', 'params': {'lossless': True}, 'use_pil': True},
+    }
+    return formats.get(quality, formats['jpeg_95'])
+
+
+from PIL import Image, PngImagePlugin
+
+def _enc_uc(s):
+    try: return b"ASCII\0\0\0" + s.encode("ascii")
+    except UnicodeEncodeError: return b"UNICODE\0" + s.encode("utf-16le")
+
+def _dec_uc(b):
+    if not isinstance(b, (bytes, bytearray)):
+        try: b = bytes(b)
+        except Exception: return None
+    if b.startswith(b"ASCII\0\0\0"): return b[8:].decode("ascii", "ignore")
+    if b.startswith(b"UNICODE\0"): return b[8:].decode("utf-16le", "ignore")
+    return b.decode("utf-8", "ignore")
+
+def save_image_metadata(image_path, metadata_dict, **save_kwargs):
+    try:
+        j = json.dumps(metadata_dict, ensure_ascii=False)
+        ext = os.path.splitext(image_path)[1].lower()
+        with Image.open(image_path) as im:
+            if ext == ".png":
+                pi = PngImagePlugin.PngInfo(); pi.add_text("comment", j)
+                im.save(image_path, pnginfo=pi, **save_kwargs); return True
+            if ext in (".jpg", ".jpeg"):
+                im.save(image_path, comment=j.encode("utf-8"), **save_kwargs); return True
+            if ext == ".webp":
+                import piexif
+                exif = {"0th": {}, "Exif": {piexif.ExifIFD.UserComment: _enc_uc(j)}, "GPS": {}, "1st": {}, "thumbnail": None}
+                im.save(image_path, format="WEBP", exif=piexif.dump(exif), **save_kwargs); return True
+            raise ValueError("Unsupported format")
+    except Exception as e:
+        print(f"Error saving metadata: {e}"); return False
+
+def read_image_metadata(image_path):
+    try:
+        ext = os.path.splitext(image_path)[1].lower()
+        with Image.open(image_path) as im:
+            if ext == ".png":
+                val = (getattr(im, "text", {}) or {}).get("comment") or im.info.get("comment")
+                return json.loads(val) if val else None
+            if ext in (".jpg", ".jpeg"):
+                val = im.info.get("comment")
+                if isinstance(val, (bytes, bytearray)): val = val.decode("utf-8", "ignore")
+                if val:
+                    try: return json.loads(val)
+                    except Exception: pass
+                exif = getattr(im, "getexif", lambda: None)()
+                if exif:
+                    uc = exif.get(37510)  # UserComment
+                    s = _dec_uc(uc) if uc else None
+                    if s:
+                        try: return json.loads(s)
+                        except Exception: pass
+                return None
+            if ext == ".webp":
+                exif_bytes = Image.open(image_path).info.get("exif")
+                if not exif_bytes: return None
+                import piexif
+                uc = piexif.load(exif_bytes).get("Exif", {}).get(piexif.ExifIFD.UserComment)
+                s = _dec_uc(uc) if uc else None
+                return json.loads(s) if s else None
+        return None
+    except Exception as e:
+        print(f"Error reading metadata: {e}"); return None
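For orientation, a minimal usage sketch of the new module, assuming a source clip whose audio should survive a re-encode; the input and output paths and the placeholder frames are hypothetical. This mirrors the pattern the MatAnyone changes earlier in this diff use (extract, save with a chosen codec, recombine, clean up):

import numpy as np
from shared.utils.audio_video import (
    extract_audio_tracks, combine_video_with_audio_tracks,
    cleanup_temp_audio_files, save_video,
)

frames = np.zeros((48, 256, 256, 3), dtype=np.uint8)   # placeholder frames (f, h, w, c)

tracks, meta = extract_audio_tracks("input.mp4")       # hypothetical input file
tmp = save_video(frames, "out_tmp.mp4", fps=24, codec_type="libx264_lossless")
if tracks:
    combine_video_with_audio_tracks(tmp, tracks, "out.mp4", audio_metadata=meta)
    cleanup_temp_audio_files(tracks)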
shared/utils/utils.py

@@ -1,6 +1,5 @@
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 import argparse
-import binascii
 import os
 import os.path as osp
 import torchvision.transforms.functional as TF
@@ -10,7 +9,6 @@ import tempfile
 import imageio
 import torch
 import decord
 import torchvision
 from PIL import Image
 import numpy as np
 from rembg import remove, new_session
@@ -21,8 +19,6 @@ import tempfile
 import subprocess
 import json

 __all__ = ['cache_video', 'cache_image', 'str2bool']


 from PIL import Image
@@ -218,84 +214,6 @@ def resize_and_remove_background(img_list, budget_width, budget_height, rm_backg
     return output_list


-def rand_name(length=8, suffix=''):
-    name = binascii.b2a_hex(os.urandom(length)).decode('utf-8')
-    if suffix:
-        if not suffix.startswith('.'):
-            suffix = '.' + suffix
-        name += suffix
-    return name
-
-
-def cache_video(tensor,
-                save_file=None,
-                fps=30,
-                suffix='.mp4',
-                nrow=8,
-                normalize=True,
-                value_range=(-1, 1),
-                retry=5):
-    # cache file
-    cache_file = osp.join('/tmp', rand_name(
-        suffix=suffix)) if save_file is None else save_file
-
-    # save to cache
-    error = None
-    for _ in range(retry):
-        try:
-            # preprocess
-            tensor = tensor.clamp(min(value_range), max(value_range))
-            tensor = torch.stack([
-                torchvision.utils.make_grid(
-                    u, nrow=nrow, normalize=normalize, value_range=value_range)
-                for u in tensor.unbind(2)
-            ],
-                                 dim=1).permute(1, 2, 3, 0)
-            tensor = (tensor * 255).type(torch.uint8).cpu()
-
-            # write video
-            writer = imageio.get_writer(
-                cache_file, fps=fps, codec='libx264', quality=8)
-            for frame in tensor.numpy():
-                writer.append_data(frame)
-            writer.close()
-            return cache_file
-        except Exception as e:
-            error = e
-            continue
-    else:
-        print(f'cache_video failed, error: {error}', flush=True)
-        return None
-
-
-def cache_image(tensor,
-                save_file,
-                nrow=8,
-                normalize=True,
-                value_range=(-1, 1),
-                retry=5):
-    # cache file
-    suffix = osp.splitext(save_file)[1]
-    if suffix.lower() not in [
-            '.jpg', '.jpeg', '.png', '.tiff', '.gif', '.webp'
-    ]:
-        suffix = '.png'
-
-    # save to cache
-    error = None
-    for _ in range(retry):
-        try:
-            tensor = tensor.clamp(min(value_range), max(value_range))
-            torchvision.utils.save_image(
-                tensor,
-                save_file,
-                nrow=nrow,
-                normalize=normalize,
-                value_range=value_range)
-            return save_file
-        except Exception as e:
-            error = e
-            continue
-
-
 def str2bool(v):
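As a side note, the tensor preprocessing that cache_video performed above (and that save_video in shared/utils/audio_video.py reproduces) is easy to trace with toy shapes. A small sketch, assuming a (b, c, f, h, w) tensor in [-1, 1]:

import torch, torchvision

b, c, f, h, w = 1, 3, 2, 4, 4
video = (torch.rand(b, c, f, h, w) * 2 - 1).clamp(-1, 1)    # value_range=(-1, 1)
frames = torch.stack([
    torchvision.utils.make_grid(u, nrow=8, normalize=True, value_range=(-1, 1))
    for u in video.unbind(2)                  # one (b, c, h, w) slice per frame
], dim=1).permute(1, 2, 3, 0)                 # (c, f, H, W) -> (f, H, W, c)
frames = (frames * 255).type(torch.uint8)     # uint8 frames for the imageio writer
print(frames.shape)                           # torch.Size([2, 8, 8, 3]): 4px image plus 2px grid padding per side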
@@ -435,212 +353,3 @@ def create_progress_hook(filename):
     return hook

[removed: the old copies of extract_audio_tracks, combine_and_concatenate_video_with_audio_tracks, combine_video_with_audio_tracks and cleanup_temp_audio_files, together with their stray duplicated "import tempfile, os" / "import ffmpeg" / "import subprocess" lines; the code matches the versions added in shared/utils/audio_video.py above]
wgp.py: 154 changed lines
@@ -13,11 +13,13 @@ from datetime import datetime
 import gradio as gr
 import random
 import json
 import numpy as np
 import importlib
 from shared.utils import notification_sound
 from shared.utils.loras_mutipliers import preparse_loras_multipliers, parse_loras_multipliers
-from shared.utils.utils import cache_video, convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video
-from shared.utils.utils import extract_audio_tracks, combine_video_with_audio_tracks, combine_and_concatenate_video_with_audio_tracks, cleanup_temp_audio_files, calculate_new_dimensions
+from shared.utils.utils import convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video, calculate_new_dimensions
+from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, combine_and_concatenate_video_with_audio_tracks, cleanup_temp_audio_files, save_video, save_image
+from shared.utils.audio_video import save_image_metadata, read_image_metadata
 from shared.match_archi import match_nvidia_architecture
 from shared.attention import get_attention_modes, get_supported_attention_modes
 from huggingface_hub import hf_hub_download, snapshot_download
@@ -53,7 +55,7 @@ AUTOSAVE_FILENAME = "queue.zip"
 PROMPT_VARS_MAX = 10

 target_mmgp_version = "3.5.8"
-WanGP_version = "7.76"
+WanGP_version = "7.77"
 settings_version = 2.23
 max_source_video_frames = 3000
 prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
@@ -1712,7 +1714,8 @@ if not Path(server_config_filename).is_file():
         "transformer_types": [],
         "transformer_quantization": "int8",
         "text_encoder_quantization" : "int8",
-        "save_path": "outputs", #os.path.join(os.getcwd(),
+        "save_path": "outputs",
+        "image_save_path": "outputs",
         "compile" : "",
         "metadata_type": "metadata",
         "boost" : 1,
@@ -2186,7 +2189,11 @@ if len(args.vae_config) > 0:
     vae_config = int(args.vae_config)

 reload_needed = False
-save_path = server_config.get("save_path", os.path.join(os.getcwd(), "gradio_outputs"))
+save_path = server_config.get("save_path", os.path.join(os.getcwd(), "outputs"))
+image_save_path = server_config.get("image_save_path", os.path.join(os.getcwd(), "outputs"))
+if not "video_output_codec" in server_config: server_config["video_output_codec"]= "libx264_8"
+if not "image_output_codec" in server_config: server_config["image_output_codec"]= "jpeg_95"

 preload_model_policy = server_config.get("preload_model_policy", [])
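Taken together, the shims above mean an existing server config is upgraded in place. A hypothetical fragment of the resulting settings, with keys and defaults exactly as applied above:

server_config_fragment = {
    "save_path": "outputs",               # folder for generated videos
    "image_save_path": "outputs",         # folder for generated images, may now differ
    "video_output_codec": "libx264_8",    # see _get_codec_params in shared/utils/audio_video.py
    "image_output_codec": "jpeg_95",      # see _get_format_info in shared/utils/audio_video.py
}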
@@ -2699,6 +2706,7 @@ def apply_changes( state,
         VAE_precision_choice,
         mixed_precision_choice,
         save_path_choice,
+        image_save_path_choice,
         attention_choice,
         compile_choice,
         profile_choice,
@@ -2718,6 +2726,9 @@ def apply_changes( state,
         notification_sound_volume_choice = 50,
         max_frames_multiplier_choice = 1,
         display_stats_choice = 0,
+        video_output_codec_choice = None,
+        image_output_codec_choice = None,
+        audio_output_codec_choice = None,
         last_resolution_choice = None,
 ):
     if args.lock_config:
@@ -2730,6 +2741,7 @@ def apply_changes( state,
         "transformer_types": transformer_types_choices,
         "text_encoder_quantization" : text_encoder_quantization_choice,
         "save_path" : save_path_choice,
+        "image_save_path" : image_save_path_choice,
         "compile" : compile_choice,
         "profile" : profile_choice,
         "vae_config" : vae_config_choice,
@@ -2751,6 +2763,9 @@ def apply_changes( state,
         "notification_sound_volume" : notification_sound_volume_choice,
         "max_frames_multiplier" : max_frames_multiplier_choice,
         "display_stats" : display_stats_choice,
+        "video_output_codec" : video_output_codec_choice,
+        "image_output_codec" : image_output_codec_choice,
+        "audio_output_codec" : audio_output_codec_choice,
         "last_model_type" : state["model_type"],
         "last_model_per_family": state["last_model_per_family"],
         "last_advanced_choice": state["advanced"],
@@ -2784,6 +2799,7 @@ def apply_changes( state,
     vae_config = server_config["vae_config"]
     boost = server_config["boost"]
     save_path = server_config["save_path"]
+    image_save_path = server_config["image_save_path"]
     preload_model_policy = server_config["preload_model_policy"]
     transformer_quantization = server_config["transformer_quantization"]
     transformer_dtype_policy = server_config["transformer_dtype_policy"]
@@ -2791,7 +2807,9 @@ def apply_changes( state,
     transformer_types = server_config["transformer_types"]
     model_filename = get_model_filename(transformer_type, transformer_quantization, transformer_dtype_policy)
     state["model_filename"] = model_filename
-    if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas", "depth_anything_v2_variant", "notification_sound_enabled", "notification_sound_volume", "mmaudio_enabled", "max_frames_multiplier", "display_stats"] for change in changes ):
+    if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas", "depth_anything_v2_variant",
+                      "notification_sound_enabled", "notification_sound_volume", "mmaudio_enabled", "max_frames_multiplier", "display_stats",
+                      "video_output_codec", "image_output_codec", "audio_output_codec"] for change in changes ):
         model_family = gr.Dropdown()
         model_choice = gr.Dropdown()
     else:
@@ -2802,18 +2820,6 @@ def apply_changes( state,
     mmaudio_enabled = server_config["mmaudio_enabled"] > 0
     return "<DIV ALIGN=CENTER>The new configuration has been succesfully applied</DIV>", header, model_family, model_choice, gr.Row(visible= server_config["enhancer_enabled"] == 1), gr.Row(visible= mmaudio_enabled), gr.Column(visible= mmaudio_enabled)


-
-from moviepy.editor import ImageSequenceClip
-import numpy as np
-
-def save_video(final_frames, output_path, fps=24):
-    assert final_frames.ndim == 4 and final_frames.shape[3] == 3, f"invalid shape: {final_frames} (need t h w c)"
-    if final_frames.dtype != np.uint8:
-        final_frames = (final_frames * 255).astype(np.uint8)
-    ImageSequenceClip(list(final_frames), fps=fps).write_videofile(output_path, verbose= False)
-
-
 def get_gen_info(state):
     cache = state.get("gen", None)
     if cache == None:
@@ -3754,7 +3760,7 @@ def edit_video(
     any_change = False
     if sample != None:
         video_path =get_available_filename(save_path, video_source, "_tmp") if any_mmaudio or has_already_audio else get_available_filename(save_path, video_source, "_post")
-        cache_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1))
+        save_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None))

         if any_mmaudio or has_already_audio: tmp_path = video_path
         any_change = True
@@ -3970,7 +3976,7 @@ def generate_video(
     process_map_video_guide = { "P": "pose", "D" : "depth", "S": "scribble", "E": "canny", "L": "flow", "C": "gray", "M": "inpaint", "U": "identity"}
     processes_names = { "pose": "Open Pose", "depth": "Depth Mask", "scribble" : "Shapes", "flow" : "Flow Map", "gray" : "Gray Levels", "inpaint" : "Inpaint Mask", "identity": "Identity Mask", "raw" : "Raw Format", "canny" : "Canny Edges"}

-    global wan_model, offloadobj, reload_needed, save_path
+    global wan_model, offloadobj, reload_needed
     gen = get_gen_info(state)
     torch.set_grad_enabled(False)
     if mode.startswith("edit_"):
@@ -4238,6 +4244,7 @@ def generate_video(

     torch.set_grad_enabled(False)
     os.makedirs(save_path, exist_ok=True)
+    os.makedirs(image_save_path, exist_ok=True)
     gc.collect()
     torch.cuda.empty_cache()
     wan_model._interrupt = False
@@ -4709,17 +4716,18 @@ def generate_video(
             any_mmaudio = MMAudio_setting != 0 and server_config.get("mmaudio_enabled", 0) != 0 and sample.shape[1] >=fps

             if is_image:
-                sample = sample.permute(1,2,3,0) #c f h w -> f h w c
-                new_video_path = []
+                image_path = os.path.join(image_save_path, file_name)
+                sample = sample.transpose(1,0) #c f h w -> f c h w
+                new_image_path = []
                 for no, img in enumerate(sample):
                     img = Image.fromarray((127.5 * (img + 1.0)).cpu().byte().numpy())
-                    img_path = os.path.splitext(video_path)[0] + ("" if no==0 else f"_{no}") + ".jpg"
-                    new_video_path.append(img_path)
-                    img.save(img_path)
-                video_path= new_video_path
+                    img_path = os.path.splitext(image_path)[0] + ("" if no==0 else f"_{no}") + ".jpg"
+                    new_image_path.append(save_image(img, save_file = img_path, quality = server_config.get("image_output_codec", None)))
+
+                video_path= new_image_path
             elif len(control_audio_tracks) > 0 or len(source_audio_tracks) > 0 or output_new_audio_filepath is not None or any_mmaudio or output_new_audio_data is not None or audio_source is not None:
                 video_path = os.path.join(save_path, file_name)
                 save_path_tmp = video_path[:-4] + "_tmp.mp4"
-                cache_video( tensor=sample[None], save_file=save_path_tmp, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1))
+                save_video( tensor=sample[None], save_file=save_path_tmp, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type = server_config.get("video_output_codec", None))
                 output_new_audio_temp_filepath = None
                 new_audio_from_start = reset_control_aligment
                 source_audio_duration = source_video_frames_count / fps
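A small sketch of the new image path above in isolation, assuming a single (c, h, w) frame tensor in [-1, 1]; note that save_image rewrites the file extension to match the requested codec:

import torch
from shared.utils.audio_video import save_image

img = (torch.rand(3, 256, 256) * 2 - 1)                # placeholder frame tensor
path = save_image(img, save_file="outputs/frame.jpg",  # hypothetical path
                  quality="webp_85")
print(path)                                            # -> outputs/frame.webp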
@@ -4746,7 +4754,7 @@ def generate_video(
                 if output_new_audio_temp_filepath is not None: os.remove(output_new_audio_temp_filepath)

             else:
-                cache_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1))
+                save_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None))

             end_time = time.time()

@@ -4756,6 +4764,11 @@ def generate_video(
             inputs.pop("mode")
             inputs["model_type"] = model_type
             inputs["model_filename"] = original_filename
+            if is_image:
+                inputs["image_quality"] = server_config.get("image_output_codec", None)
+            else:
+                inputs["video_quality"] = server_config.get("video_output_codec", None)
+
             modules = get_model_recursive_prop(model_type, "modules", return_list= True)
             if len(modules) > 0 : inputs["modules"] = modules
             if len(transformer_loras_filenames) > 0:
@@ -4778,8 +4791,7 @@ def generate_video(
                     json.dump(configs, f, indent=4)
             elif metadata_choice == "metadata":
                 if is_image:
-                    with Image.open(path) as img:
-                        img.save(path, comment=json.dumps(configs))
+                    save_image_metadata(path, configs)
                 else:
                     from mutagen.mp4 import MP4
                     file = MP4(path)
@@ -5690,7 +5702,7 @@ def has_video_file_extension(filename):

 def has_image_file_extension(filename):
     extension = os.path.splitext(filename)[-1]
-    return extension in [".jpeg", ".jpg", ".png", ".bmp", ".tiff"]
+    return extension in [".jpeg", ".jpg", ".png", ".webp", ".bmp", ".tiff"]

 def add_videos_to_gallery(state, input_file_list, choice, files_to_load):
     gen = get_gen_info(state)
@@ -5795,7 +5807,7 @@ def use_video_settings(state, input_file_list, choice):

 def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, switch_type_if_compatible):
     configs = None
-    tags = None
+    any_image_or_video = False
     if file_path.endswith(".json") and allow_json:
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
@@ -5807,22 +5819,22 @@ def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, sw
         try:
             file = MP4(file_path)
             tags = file.tags['©cmt'][0]
+            configs = json.loads(tags)
+            any_image_or_video = True
         except:
             pass
     elif has_image_file_extension(file_path):
         try:
-            with Image.open(file_path) as img:
-                tags = img.info["comment"]
+            configs = read_image_metadata(file_path)
+            any_image_or_video = True
         except:
             pass
-    if tags is not None:
-        try:
-            configs = json.loads(tags)
-            if not "WanGP" in configs.get("type", ""): configs = None
-        except:
-            configs = None
-    if configs == None:
-        return None, False
+    if configs is None: return None, False

@@ -5848,7 +5860,7 @@ def get_settings_from_file(state, file_path, allow_json, merge_with_defaults, sw
         configs = defaults
         configs["model_type"] = model_type

-    return configs, tags != None
+    return configs, any_image_or_video

 def record_image_mode_tab(state, evt:gr.SelectData):
     state["image_mode_tab"] = 0 if evt.index ==0 else 1
@@ -7849,10 +7861,6 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
                 label="User Interface Theme. You will need to restart the App the see new Theme."
             )

-            save_path_choice = gr.Textbox(
-                label="Output Folder for Generated Videos (need to restart app to be taken into account)",
-                value=server_config.get("save_path", save_path)
-            )

         with gr.Tab("Performance"):
@@ -7976,6 +7984,53 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
                 label="MMAudio (if enabled, 10 GB of extra models will be downloaded)"
             )

+        with gr.Tab("Outputs"):
+
+            video_output_codec_choice = gr.Dropdown(
+                choices=[
+                    ("x265 Balanced Quality (CRF 28)", 'libx265_28'),
+                    ("x264 Balanced Quality (Level 8)", 'libx264_8'),
+                    ("x265 High Quality (CRF 8)", 'libx265_8'),
+                    ("x264 High Quality (Level 10)", 'libx264_10'),
+                    ("x264 Lossless", 'libx264_lossless'),
+                ],
+                value=server_config.get("video_output_codec", "libx264_8"),
+                label="Video Codec to use"
+            )
+
+            image_output_codec_choice = gr.Dropdown(
+                choices=[
+                    ("JPEG Quality 85", 'jpeg_85'),
+                    ("WEBP Quality 85", 'webp_85'),
+                    ("JPEG Quality 95", 'jpeg_95'),
+                    ("WEBP Quality 95", 'webp_95'),
+                    ("WEBP Lossless", 'webp_lossless'),
+                    ("PNG Lossless", 'png'),
+                ],
+                value=server_config.get("image_output_codec", "jpeg_95"),
+                label="Image Codec to use"
+            )
+
+            audio_output_codec_choice = gr.Dropdown(
+                choices=[
+                    ("AAC 128 kbit", 'aac_128'),
+                ],
+                value=server_config.get("audio_output_codec", "aac_128"),
+                visible = False,
+                label="Audio Codec to use"
+            )
+
+            video_save_path_choice = gr.Textbox(
+                label="Output Folder for Generated Videos (need to restart app to be taken into account)",
+                value=server_config.get("save_path", save_path)
+            )
+
+            image_save_path_choice = gr.Textbox(
+                label="Output Folder for Generated Images (need to restart app to be taken into account)",
+                value=server_config.get("image_save_path", image_save_path)
+            )

         with gr.Tab("Notifications"):
             gr.Markdown("### Notification Settings")
             notification_sound_enabled_choice = gr.Dropdown(
@@ -8008,7 +8063,8 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
         text_encoder_quantization_choice,
         VAE_precision_choice,
         mixed_precision_choice,
-        save_path_choice,
+        video_save_path_choice,
+        image_save_path_choice,
         attention_choice,
         compile_choice,
         profile_choice,
@@ -8028,6 +8084,9 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
         notification_sound_volume_choice,
         max_frames_multiplier_choice,
         display_stats_choice,
+        video_output_codec_choice,
+        image_output_codec_choice,
+        audio_output_codec_choice,
         resolution,
     ],
     outputs= [msg , header, model_family, model_choice, prompt_enhancer_row, mmaudio_tab, PP_MMAudio_col]
@@ -8626,7 +8685,7 @@ def create_ui():
     with gr.Tab("Guides", id="info") as info_tab:
         generate_info_tab()
     with gr.Tab("Video Mask Creator", id="video_mask_creator") as video_mask_creator:
-        matanyone_app.display(main_tabs, tab_state, video_guide, image_guide, video_mask, image_mask, image_refs)
+        matanyone_app.display(main_tabs, tab_state, server_config, video_guide, image_guide, video_mask, image_mask, image_refs)
     if not args.lock_config:
         with gr.Tab("Downloads", id="downloads") as downloads_tab:
             generate_download_tab(lset_name, loras_choices, state)
@@ -8662,5 +8721,4 @@ if __name__ == "__main__":
     else:
         url = "http://" + server_name
         webbrowser.open(url + ":" + str(server_port), new = 0, autoraise = True)
-    demo.launch(favicon_path="favicon.png", server_name=server_name, server_port=server_port, share=args.share, allowed_paths=[save_path])
-
+    demo.launch(favicon_path="favicon.png", server_name=server_name, server_port=server_port, share=args.share, allowed_paths=[save_path] + [] if save_path == image_save_path else [image_save_path] )
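One reader's caution on the final added line, kept verbatim above: in Python, a conditional expression binds looser than +, so [save_path] + [] if cond else [image_save_path] parses as ([save_path] + []) if cond else [image_save_path], meaning that when the two folders differ only the image folder ends up in allowed_paths. A parenthesized sketch of what was presumably intended:

# Keep both folders allowed when they differ (assumed intent, not the committed code):
allowed = [save_path] + ([] if save_path == image_save_path else [image_save_path])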