mirror of
https://github.com/Wan-Video/Wan2.1.git
synced 2025-11-04 14:16:57 +00:00
70 lines
2.2 KiB
Python
70 lines
2.2 KiB
Python
from pathlib import Path
|
|
import os, tempfile
|
|
import numpy as np
|
|
import soundfile as sf
|
|
import librosa
|
|
import torch
|
|
import gc
|
|
|
|
from audio_separator.separator import Separator
|
|
|
|
def get_vocals(src_path: str, dst_path: str, min_seconds: float = 8) -> str:
|
|
"""
|
|
If the source audio is shorter than `min_seconds`, pad with trailing silence
|
|
in a temporary file, then run separation and save only the vocals to dst_path.
|
|
Returns the full path to the vocals file.
|
|
"""
|
|
|
|
default_device = torch.get_default_device()
|
|
torch.set_default_device('cpu')
|
|
|
|
dst = Path(dst_path)
|
|
dst.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Quick duration check
|
|
duration = librosa.get_duration(path=src_path)
|
|
|
|
use_path = src_path
|
|
temp_path = None
|
|
try:
|
|
if duration < min_seconds:
|
|
# Load (resample) and pad in memory
|
|
y, sr = librosa.load(src_path, sr=None, mono=False)
|
|
if y.ndim == 1: # ensure shape (channels, samples)
|
|
y = y[np.newaxis, :]
|
|
target_len = int(min_seconds * sr)
|
|
pad = max(0, target_len - y.shape[1])
|
|
if pad:
|
|
y = np.pad(y, ((0, 0), (0, pad)), mode="constant")
|
|
|
|
# Write a temp WAV for the separator
|
|
fd, temp_path = tempfile.mkstemp(suffix=".wav")
|
|
os.close(fd)
|
|
sf.write(temp_path, y.T, sr) # soundfile expects (frames, channels)
|
|
use_path = temp_path
|
|
|
|
# Run separation: emit only the vocals, with your exact filename
|
|
sep = Separator(
|
|
output_dir=str(dst.parent),
|
|
output_format=(dst.suffix.lstrip(".") or "wav"),
|
|
output_single_stem="Vocals",
|
|
model_file_dir="ckpts/roformer/" #model_bs_roformer_ep_317_sdr_12.9755.ckpt"
|
|
)
|
|
sep.load_model()
|
|
out_files = sep.separate(use_path, {"Vocals": dst.stem})
|
|
|
|
out = Path(out_files[0])
|
|
return str(out if out.is_absolute() else (dst.parent / out))
|
|
finally:
|
|
if temp_path and os.path.exists(temp_path):
|
|
os.remove(temp_path)
|
|
|
|
torch.cuda.empty_cache()
|
|
gc.collect()
|
|
torch.set_default_device(default_device)
|
|
|
|
# Example:
|
|
# final = get_vocals("in/clip.mp3", "out/vocals.wav")
|
|
# print(final)
|
|
|