# Wan2.1/preprocessing/extract_vocals.py

from pathlib import Path
import os, tempfile
import numpy as np
import soundfile as sf
import librosa
import torch
import gc

from audio_separator.separator import Separator

def get_vocals(src_path: str, dst_path: str, min_seconds: float = 8) -> str:
    """
    If the source audio is shorter than `min_seconds`, pad with trailing silence
    in a temporary file, then run separation and save only the vocals to dst_path.
    Returns the full path to the vocals file.
    """

    default_device = torch.get_default_device()
    torch.set_default_device('cpu')

    dst = Path(dst_path)
    dst.parent.mkdir(parents=True, exist_ok=True)

    # Quick duration check
    duration = librosa.get_duration(path=src_path)

    use_path = src_path
    temp_path = None
    try:
        if duration < min_seconds:
            # Load (resample) and pad in memory
            y, sr = librosa.load(src_path, sr=None, mono=False)
            if y.ndim == 1:  # ensure shape (channels, samples)
                y = y[np.newaxis, :]
            target_len = int(min_seconds * sr)
            pad = max(0, target_len - y.shape[1])
            if pad:
                y = np.pad(y, ((0, 0), (0, pad)), mode="constant")

            # Write a temp WAV for the separator
            fd, temp_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            sf.write(temp_path, y.T, sr)  # soundfile expects (frames, channels)
            use_path = temp_path

        # Run separation: emit only the vocals, with your exact filename
        sep = Separator(
            output_dir=str(dst.parent),
            output_format=(dst.suffix.lstrip(".") or "wav"),
            output_single_stem="Vocals",
            model_file_dir="ckpts/roformer/" #model_bs_roformer_ep_317_sdr_12.9755.ckpt"
        )
        sep.load_model()
        out_files = sep.separate(use_path, {"Vocals": dst.stem})

        out = Path(out_files[0])
        return str(out if out.is_absolute() else (dst.parent / out))
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)

        torch.cuda.empty_cache()
        gc.collect()
        torch.set_default_device(default_device)

# Example:
# final = get_vocals("in/clip.mp3", "out/vocals.wav")
# print(final)