mirror of
				https://github.com/Wan-Video/Wan2.1.git
				synced 2025-11-04 06:15:17 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			149 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			149 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""Kokoro TTS CLI
 | 
						|
Example usage:
 | 
						|
python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
 | 
						|
 | 
						|
echo "Bom dia mundo, como vão vocês" > text.txt
 | 
						|
python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav
 | 
						|
 | 
						|
Common issues:
 | 
						|
pip not installed: `uv pip install pip`
 | 
						|
(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
 | 
						|
 | 
						|
espeak not installed: `apt-get install espeak-ng`
 | 
						|
"""
 | 
						|
 | 
						|
import argparse
 | 
						|
import wave
 | 
						|
from pathlib import Path
 | 
						|
from typing import Generator, TYPE_CHECKING
 | 
						|
 | 
						|
import numpy as np
 | 
						|
from loguru import logger
 | 
						|
 | 
						|
# Single-letter language codes mapped to the language each one stands for.
# The mapping only exists to keep the human-readable names next to the codes.
_LANGUAGE_NAMES = {
    "a": "American English",
    "b": "British English",
    "h": "Hindi",
    "e": "Spanish",
    "f": "French",
    "i": "Italian",
    "p": "Brazilian Portuguese",
    "j": "Japanese",
    "z": "Mandarin Chinese",
}

# Dicts preserve insertion order, so the codes come out in the order above.
languages = list(_LANGUAGE_NAMES)
 | 
						|
 | 
						|
if TYPE_CHECKING:
 | 
						|
    from kokoro import KPipeline
 | 
						|
 | 
						|
 | 
						|
def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    """Lazily synthesize ``text`` and yield each result the pipeline produces.

    A ``KPipeline`` is built for ``kokoro_language`` and called on ``text``
    with the requested ``voice`` and ``speed``; the text is split on runs of
    newline characters. A warning is logged when the voice name does not
    start with the language code, but synthesis still goes ahead.

    This is a generator function: nothing below runs (including the warning
    and the pipeline construction) until the caller starts iterating.

    Args:
        text: The text to speak.
        kokoro_language: Language code handed to ``KPipeline`` as ``lang_code``.
        voice: Voice name handed to the pipeline.
        speed: Speed value handed to the pipeline.

    Yields:
        Whatever the pipeline call yields, unchanged.
    """
    # Runtime import is local; at module level kokoro is only imported
    # under TYPE_CHECKING for the return annotation.
    from kokoro import KPipeline

    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    synthesize = KPipeline(lang_code=kokoro_language)
    results = synthesize(text, voice=voice, speed=speed, split_pattern=r"\n+")
    yield from results
 | 
						|
 | 
						|
 | 
						|
def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
) -> None:
    """Synthesize ``text`` and stream it into a mono 16-bit WAV file.

    Results from ``generate_audio`` are converted to 16-bit PCM and appended
    to ``output_file`` one at a time. Results whose ``audio`` is ``None`` are
    skipped. The phonemes of every result are logged at DEBUG level.

    Args:
        output_file: Destination path; it is resolved and opened for writing,
            replacing any existing file.
        text: The text to speak.
        kokoro_language: Language code forwarded to ``generate_audio``.
        voice: Voice name forwarded to ``generate_audio``.
        speed: Speed value forwarded to ``generate_audio``.
    """
    pcm_limits = np.iinfo(np.int16)

    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        # Header declares 24 kHz; assumes the pipeline emits audio at that
        # rate -- TODO confirm against kokoro.
        wav_file.setframerate(24000)

        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                continue
            # Presumably float samples nominally within [-1, 1] -- verify.
            scaled = result.audio.numpy() * 32767
            # Clip before the cast: a float outside the int16 range does not
            # saturate when converted, it overflows and yields a wrong sample.
            pcm = np.clip(scaled, pcm_limits.min, pcm_limits.max).astype(np.int16)
            wav_file.writeframes(pcm.tobytes())
 | 
						|
 | 
						|
 | 
						|
def main() -> None:
    """Parse command-line arguments and synthesize speech into a WAV file.

    The text comes from ``--text``, from ``--input-file``, or from standard
    input when neither is given. The language defaults to the first character
    of the voice name when ``--language`` is not supplied.

    Raises:
        ValueError: If both ``--text`` and ``--input-file`` are supplied.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()

    # loguru's logger.level(name) only looks up a level definition; it does
    # not change what gets emitted. Install a stderr sink with the threshold
    # the user asked for so that --debug actually toggles DEBUG output.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
    logger.debug(args)

    lang = args.language or args.voice[0]

    if args.text is not None and args.input_file is not None:
        raise ValueError("You cannot specify both 'text' and 'input_file'")

    if args.text is not None:
        # Compare against None (not truthiness) so an explicit empty --text
        # is honored instead of silently falling back to stdin.
        text = args.text
    elif args.input_file is not None:
        # Explicit encoding: the platform default would garble non-ASCII
        # text on systems whose locale is not UTF-8.
        text = args.input_file.read_text(encoding="utf-8")
    else:
        print("Press Ctrl+D to stop reading input and start generating", flush=True)
        # Lines read from stdin already end in a newline, so read the stream
        # as-is instead of joining the lines with an extra "\n".
        text = sys.stdin.read()

    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if out_file.suffix != ".wav":
        logger.warning("The output file name should end with .wav")
    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )


if __name__ == "__main__":
    main()
 |