Merge pull request #500 from psyb0t/feature_add-cuda-docker-runner

Add Docker support with automatic GPU detection and optimization for Debian/Ubuntu systems
2026-02-06 10:47:46 +00:00 · 2025-09-27 12:58:29 +02:00 · 2025-09-27 12:58:29 +02:00 · d16cc19945
commit d16cc19945
parent 6dfd173152 b28cb446bb
4 changed files with 449 additions and 2 deletions
--- a/92
+++ b/92
@ -0,0 +1,92 @@
+# syntax=docker/dockerfile:1
+FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+
+# Build arg for GPU architectures - specify which CUDA compute capabilities to compile for
+# Common values:
+#   7.0  - Tesla V100
+#   7.5  - RTX 2060, 2070, 2080, Titan RTX
+#   8.0  - A100, A800 (Ampere data center)
+#   8.6  - RTX 3060, 3070, 3080, 3090 (Ampere consumer)
+#   8.9  - RTX 4070, 4080, 4090 (Ada Lovelace)
+#   9.0  - H100, H800 (Hopper data center)
+#   12.0 - RTX 5070, 5080, 5090 (Blackwell) - Note: sm_120 architecture
+#          NOTE(review): sm_120 is understood to need a CUDA 12.8+ toolchain; confirm it builds on this 12.4.1 base
+#
+# Examples:
+#   RTX 3060: --build-arg CUDA_ARCHITECTURES="8.6"
+#   RTX 4090: --build-arg CUDA_ARCHITECTURES="8.9"
+#   Multiple: --build-arg CUDA_ARCHITECTURES="8.0;8.6;8.9"
+#
+# Note: Including 8.9 or 9.0 may cause compilation issues on some setups
+# Default includes 8.0 and 8.6 for broad Ampere compatibility
+ARG CUDA_ARCHITECTURES="8.0;8.6"
+
+# Build-time only: keeps apt non-interactive in RUN steps without persisting into the runtime environment
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install system dependencies.
+# apt-get (not apt) offers a stable CLI for scripts, and the package lists are removed in the
+# same layer so they never become part of the image. Recommended packages are kept on purpose:
+# git/curl rely on ca-certificates and the later source build may rely on pip's recommended toolchain.
+RUN apt-get update && \
+    apt-get install -y \
+    cmake \
+    curl \
+    ffmpeg \
+    git \
+    libgl1 \
+    libglib2.0-0 \
+    ninja-build \
+    python3 \
+    python3-pip \
+    wget && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+COPY requirements.txt .
+
+# Upgrade pip first (--no-cache-dir keeps pip's download cache out of the image layers)
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel
+
+# Install the project requirements (the COPY above already fails the build if the file is missing)
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install PyTorch with CUDA support
+RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cu124 \
+    torch==2.6.0+cu124 torchvision==0.21.0+cu124
+
+# Settings for the SageAttention source install below (its GPU detection is patched out):
+# target architectures come from the CUDA_ARCHITECTURES build arg.
+ENV TORCH_CUDA_ARCH_LIST="${CUDA_ARCHITECTURES}" \
+    FORCE_CUDA="1" \
+    MAX_JOBS="1"
+
+COPY <<EOF /tmp/patch_setup.py
+# Rewrites setup.py in the current directory so the compute capabilities come from
+# TORCH_CUDA_ARCH_LIST instead of being probed from the GPUs of the build machine.
+import os
+import sys
+
+with open('setup.py', 'r') as f:
+    content = f.read()
+
+# Get architectures from environment variable, ignoring blanks such as a trailing ';'
+arch_list = [arch.strip() for arch in os.environ.get('TORCH_CUDA_ARCH_LIST', '').split(';') if arch.strip()]
+if not arch_list:
+    sys.exit('patch_setup.py: TORCH_CUDA_ARCH_LIST is empty; pass --build-arg CUDA_ARCHITECTURES (for example 8.6)')
+arch_set = '{' + ', '.join([f'"{arch}"' for arch in arch_list]) + '}'
+
+# Replace the GPU detection section
+old_section = '''compute_capabilities = set()
+device_count = torch.cuda.device_count()
+for i in range(device_count):
+    major, minor = torch.cuda.get_device_capability(i)
+    if major < 8:
+        warnings.warn(f"skipping GPU {i} with compute capability {major}.{minor}")
+        continue
+    compute_capabilities.add(f"{major}.{minor}")'''
+
+new_section = 'compute_capabilities = ' + arch_set + '''
+print(f"Manually set compute capabilities: {compute_capabilities}")'''
+
+if old_section in content:
+    content = content.replace(old_section, new_section)
+else:
+    # The expected block is absent (setup.py differs from what this patch targets):
+    # report it instead of silently writing the file back unchanged.
+    print('WARNING: GPU detection block not found in setup.py - leaving it unpatched', file=sys.stderr)
+
+with open('setup.py', 'w') as f:
+    f.write(content)
+EOF
+
+# Build and install SageAttention from source with the patched setup.py.
+# A shallow clone is enough for a build, and the source tree is deleted in the same layer
+# because the installed package no longer needs it.
+# NOTE(review): the clone is not pinned to a tag or commit, so builds are not reproducible - consider pinning.
+RUN git clone --depth 1 https://github.com/thu-ml/SageAttention.git /tmp/sageattention && \
+    cd /tmp/sageattention && \
+    python3 /tmp/patch_setup.py && \
+    pip install --no-cache-dir --no-build-isolation . && \
+    cd / && \
+    rm -rf /tmp/sageattention
+
+# Create the runtime account "user" (uid 1000, bash shell, home directory) and give it
+# ownership of /workspace and of a fresh ~/.cache directory.
+RUN useradd -u 1000 -ms /bin/bash user && \
+    chown -R user:user /workspace && \
+    mkdir /home/user/.cache && \
+    chown -R user:user /home/user/.cache
+
+# Install the entrypoint outside /workspace: the launcher script bind-mounts the project
+# directory over /workspace, which would hide the copy baked into the image.
+# --chmod guarantees the script is executable regardless of its mode in the build context.
+COPY --chmod=755 entrypoint.sh /usr/local/bin/entrypoint.sh
+
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
--- a/README.md
+++ b/README.md
@ -13,7 +13,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 - Auto download of the required model adapted to your specific architecture
 - Tools integrated to facilitate Video Generation : Mask Editor, Prompt Enhancer, Temporal and Spatial Generation, MMAudio, Video Browser, Pose / Depth / Flow extractor
 - Loras Support to customize each model
- Queuing system : make your shopping list of videos to generate and come back later 
+- Queuing system : make your shopping list of videos to generate and come back later

 **Discord Server to get Help from Other Users and show your Best Videos:** https://discord.gg/g7efUW9jGV

@ -122,6 +122,33 @@ See full changelog: **[Changelog](docs/CHANGELOG.md)**

 ## 🚀 Quick Start

+### 🐳 Docker:
+
+**For Debian-based systems (Ubuntu, Debian, etc.):**
+
+```bash
+./run-docker-cuda-deb.sh
+```
+
+This automated script will:
+
+- Detect your GPU model and VRAM automatically
+- Select optimal CUDA architecture for your GPU
+- Install NVIDIA Docker runtime if needed
+- Build a Docker image with all dependencies
+- Run WanGP with optimal settings for your hardware
+
+**Docker environment includes:**
+
+- NVIDIA CUDA 12.4.1 with cuDNN support
+- PyTorch 2.6.0 with CUDA 12.4 support
+- SageAttention compiled for your specific GPU architecture
+- Optimized environment variables for performance (TF32, threading, etc.)
+- Automatic cache directory mounting for faster subsequent runs
+- Current directory mounted in container - all downloaded models, loras, generated videos and files are saved locally
+
+**Supported GPUs:** RTX 50XX, RTX 40XX, RTX 30XX, RTX 20XX, GTX 16XX, GTX 10XX, Tesla V100, A100, H100, and more.
+
 **One-click installation:** Get started instantly with [Pinokio App](https://pinokio.computer/)

 **Manual installation:**
@ -217,4 +244,4 @@ https://www.youtube.com/watch?v=T5jNiEhf9xk

 <p align="center">
 Made with ❤️ by DeepBeepMeep
-</p> 
+</p>
--- a/entrypoint.sh
+++ b/entrypoint.sh
@ -0,0 +1,118 @@
+#!/usr/bin/env bash
+export HOME=/home/user
+export PYTHONUNBUFFERED=1
+export HF_HOME=/home/user/.cache/huggingface
+
+export OMP_NUM_THREADS=$(nproc)
+export MKL_NUM_THREADS=$(nproc)
+export OPENBLAS_NUM_THREADS=$(nproc)
+export NUMEXPR_NUM_THREADS=$(nproc)
+
+export TORCH_ALLOW_TF32_CUBLAS=1
+export TORCH_ALLOW_TF32_CUDNN=1
+
+# Disable audio warnings in Docker
+export SDL_AUDIODRIVER=dummy
+export PULSE_RUNTIME_PATH=/tmp/pulse-runtime
+
+# ═══════════════════════════ CUDA DEBUG CHECKS ═══════════════════════════
+
+echo "🔍 CUDA Environment Debug Information:"
+echo "═══════════════════════════════════════════════════════════════════════"
+
+# Check CUDA driver on host (if accessible)
+if command -v nvidia-smi >/dev/null 2>&1; then
+    echo "✅ nvidia-smi available"
+    echo "📊 GPU Information:"
+    nvidia-smi --query-gpu=name,driver_version,memory.total,memory.free --format=csv,noheader,nounits 2>/dev/null || echo "❌ nvidia-smi failed to query GPU"
+    echo "🏃 Running Processes:"
+    nvidia-smi --query-compute-apps=pid,name,used_memory --format=csv,noheader,nounits 2>/dev/null || echo "ℹ️  No running CUDA processes"
+else
+    echo "❌ nvidia-smi not available in container"
+fi
+
+# Check CUDA runtime libraries
+echo ""
+echo "🔧 CUDA Runtime Check:"
+if ls /usr/local/cuda*/lib*/libcudart.so* >/dev/null 2>&1; then
+    echo "✅ CUDA runtime libraries found:"
+    ls /usr/local/cuda*/lib*/libcudart.so* 2>/dev/null
+else
+    echo "❌ CUDA runtime libraries not found"
+fi
+
+# Check CUDA devices
+echo ""
+echo "🖥️  CUDA Device Files:"
+if ls /dev/nvidia* >/dev/null 2>&1; then
+    echo "✅ NVIDIA device files found:"
+    ls -la /dev/nvidia* 2>/dev/null
+else
+    echo "❌ No NVIDIA device files found - Docker may not have GPU access"
+fi
+
+# Check CUDA environment variables
+echo ""
+echo "🌍 CUDA Environment Variables:"
+echo "   CUDA_HOME: ${CUDA_HOME:-not set}"
+echo "   CUDA_ROOT: ${CUDA_ROOT:-not set}"
+echo "   CUDA_PATH: ${CUDA_PATH:-not set}"
+echo "   LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-not set}"
+echo "   TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-not set}"
+echo "   CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-not set}"
+
+# Check PyTorch CUDA availability
+echo ""
+echo "🐍 PyTorch CUDA Check:"
+python3 -c "
+import sys
+try:
+    import torch
+    print('✅ PyTorch imported successfully')
+    print(f'   Version: {torch.__version__}')
+    print(f'   CUDA available: {torch.cuda.is_available()}')
+    if torch.cuda.is_available():
+        print(f'   CUDA version: {torch.version.cuda}')
+        print(f'   cuDNN version: {torch.backends.cudnn.version()}')
+        print(f'   Device count: {torch.cuda.device_count()}')
+        for i in range(torch.cuda.device_count()):
+            props = torch.cuda.get_device_properties(i)
+            print(f'   Device {i}: {props.name} (SM {props.major}.{props.minor}, {props.total_memory//1024//1024}MB)')
+    else:
+        print('❌ CUDA not available to PyTorch')
+        print('   This could mean:')
+        print('   - CUDA runtime not properly installed')
+        print('   - GPU not accessible to container')
+        print('   - Driver/runtime version mismatch')
+except ImportError as e:
+    print(f'❌ Failed to import PyTorch: {e}')
+except Exception as e:
+    print(f'❌ PyTorch CUDA check failed: {e}')
+" 2>&1
+
+# Check for common CUDA issues
+echo ""
+echo "🩺 Common Issue Diagnostics:"
+
+# Check if running with proper Docker flags
+if [ ! -e /dev/nvidia0 ] && [ ! -e /dev/nvidiactl ]; then
+    echo "❌ No NVIDIA device nodes - container likely missing --gpus all or --runtime=nvidia"
+fi
+
+# Check CUDA library paths
+if [ -z "$LD_LIBRARY_PATH" ] || ! echo "$LD_LIBRARY_PATH" | grep -q cuda; then
+    echo "⚠️  LD_LIBRARY_PATH may not include CUDA libraries"
+fi
+
+# Check permissions on device files
+if ls /dev/nvidia* >/dev/null 2>&1; then
+    if ! ls -la /dev/nvidia* | grep -q "rw-rw-rw-\|rw-r--r--"; then
+        echo "⚠️  NVIDIA device files may have restrictive permissions"
+    fi
+fi
+
+echo "═══════════════════════════════════════════════════════════════════════"
+echo "🚀 Starting application..."
+echo ""
+
+exec su -p user -c "python3 wgp.py --listen $*"
--- a/run-docker-cuda-deb.sh
+++ b/run-docker-cuda-deb.sh
@ -0,0 +1,210 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# ───────────────────────── helpers ─────────────────────────
+
+install_nvidia_smi_if_missing() {
+    if command -v nvidia-smi &>/dev/null; then
+        return
+    fi
+
+    echo "⚠️  nvidia-smi not found. Installing nvidia-utils…"
+    if [ "$EUID" -ne 0 ]; then
+        SUDO='sudo'
+    else
+        SUDO=''
+    fi
+
+    $SUDO apt-get update
+    $SUDO apt-get install -y nvidia-utils-535 || $SUDO apt-get install -y nvidia-utils
+
+    if ! command -v nvidia-smi &>/dev/null; then
+        echo "❌ Failed to install nvidia-smi. Cannot detect GPU architecture."
+        exit 1
+    fi
+    echo "✅ nvidia-smi installed successfully."
+}
+
+detect_gpu_name() {
+    install_nvidia_smi_if_missing
+    nvidia-smi --query-gpu=name --format=csv,noheader,nounits | head -1
+}
+
+map_gpu_to_arch() {
+    local name="$1"
+    case "$name" in
+    *"RTX 50"* | *"5090"* | *"5080"* | *"5070"*) echo "12.0" ;;
+    *"H100"* | *"H800"*) echo "9.0" ;;
+    *"RTX 40"* | *"4090"* | *"4080"* | *"4070"* | *"4060"*) echo "8.9" ;;
+    *"RTX 30"* | *"3090"* | *"3080"* | *"3070"* | *"3060"*) echo "8.6" ;;
+    *"A100"* | *"A800"* | *"A40"*) echo "8.0" ;;
+    *"Tesla V100"*) echo "7.0" ;;
+    *"RTX 20"* | *"2080"* | *"2070"* | *"2060"* | *"Titan RTX"*) echo "7.5" ;;
+    *"GTX 16"* | *"1660"* | *"1650"*) echo "7.5" ;;
+    *"GTX 10"* | *"1080"* | *"1070"* | *"1060"* | *"Tesla P100"*) echo "6.1" ;;
+    *"Tesla K80"* | *"Tesla K40"*) echo "3.7" ;;
+    *)
+        echo "❌ Unknown GPU model: $name"
+        echo "Please update the map_gpu_to_arch function for this model."
+        exit 1
+        ;;
+    esac
+}
+
+# Print the first GPU's total memory in whole GB.
+#   stdout  - memory size rounded to the nearest GB
+#   failure - exits with status 1 (message on stderr) if nvidia-smi does not return a number
+# The value is rounded rather than truncated: a card reporting slightly less than a full
+# multiple of 1024 MiB (e.g. 24564) would otherwise come out one GB short and miss the
+# ">= 24" style thresholds used when selecting a profile.
+get_gpu_vram() {
+    install_nvidia_smi_if_missing
+    # Declared separately so a failing nvidia-smi is not masked by `local`
+    local vram_mb
+    vram_mb=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -1)
+    vram_mb=${vram_mb//[[:space:]]/}
+    if ! [[ "$vram_mb" =~ ^[0-9]+$ ]]; then
+        echo "❌ Could not read GPU memory size from nvidia-smi (got: '$vram_mb')" >&2
+        exit 1
+    fi
+    echo $(( (vram_mb + 512) / 1024 ))
+}
+
+map_gpu_to_profile() {
+    local name="$1"
+    local vram_gb="$2"
+
+    # WanGP Profile descriptions from the actual UI:
+    # Profile 1: HighRAM_HighVRAM - 48GB+ RAM, 24GB+ VRAM (fastest for short videos, RTX 3090/4090)
+    # Profile 2: HighRAM_LowVRAM - 48GB+ RAM, 12GB+ VRAM (recommended, most versatile)
+    # Profile 3: LowRAM_HighVRAM - 32GB+ RAM, 24GB+ VRAM (RTX 3090/4090 with limited RAM)
+    # Profile 4: LowRAM_LowVRAM - 32GB+ RAM, 12GB+ VRAM (default, little VRAM or longer videos)
+    # Profile 5: VerylowRAM_LowVRAM - 16GB+ RAM, 10GB+ VRAM (fail safe, slow but works)
+
+    case "$name" in
+    # High-end data center GPUs with 24GB+ VRAM - Profile 1 (HighRAM_HighVRAM)
+    *"RTX 50"* | *"5090"* | *"A100"* | *"A800"* | *"H100"* | *"H800"*)
+        if [ "$vram_gb" -ge 24 ]; then
+            echo "1" # HighRAM_HighVRAM - fastest for short videos
+        else
+            echo "2" # HighRAM_LowVRAM - most versatile
+        fi
+        ;;
+    # High-end consumer GPUs (RTX 3090/4090) - Profile 1 or 3
+    *"RTX 40"* | *"4090"* | *"RTX 30"* | *"3090"*)
+        if [ "$vram_gb" -ge 24 ]; then
+            echo "3" # LowRAM_HighVRAM - good for limited RAM systems
+        else
+            echo "2" # HighRAM_LowVRAM - most versatile
+        fi
+        ;;
+    # Mid-range GPUs (RTX 3070/3080/4070/4080) - Profile 2 recommended
+    *"4080"* | *"4070"* | *"3080"* | *"3070"* | *"RTX 20"* | *"2080"* | *"2070"*)
+        if [ "$vram_gb" -ge 12 ]; then
+            echo "2" # HighRAM_LowVRAM - recommended for these GPUs
+        else
+            echo "4" # LowRAM_LowVRAM - default for little VRAM
+        fi
+        ;;
+    # Lower-end GPUs with 6-12GB VRAM - Profile 4 or 5
+    *"4060"* | *"3060"* | *"2060"* | *"GTX 16"* | *"1660"* | *"1650"*)
+        if [ "$vram_gb" -ge 10 ]; then
+            echo "4" # LowRAM_LowVRAM - default
+        else
+            echo "5" # VerylowRAM_LowVRAM - fail safe
+        fi
+        ;;
+    # Older/lower VRAM GPUs - Profile 5 (fail safe)
+    *"GTX 10"* | *"1080"* | *"1070"* | *"1060"* | *"Tesla"*)
+        echo "5" # VerylowRAM_LowVRAM - fail safe
+        ;;
+    *)
+        echo "4" # LowRAM_LowVRAM - default fallback
+        ;;
+    esac
+}
+
+# ───────────────────────── main ────────────────────────────
+
+echo "🔧 NVIDIA CUDA Setup Check:"
+
+# NVIDIA driver check: nvidia-smi must be present, otherwise the script stops here
+if command -v nvidia-smi &>/dev/null; then
+    DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | head -1)
+    echo "✅ NVIDIA Driver: $DRIVER_VERSION"
+
+    # Quick CUDA 12.x compatibility check on the driver's major version.
+    # NVIDIA's compatibility table lists 525.60.13 as the minimum Linux driver for CUDA 12.x,
+    # so anything below the 525 branch cannot run the 12.4 image.
+    if [[ "$DRIVER_VERSION" =~ ^([0-9]+) ]]; then
+        MAJOR=${BASH_REMATCH[1]}
+        if [ "$MAJOR" -lt 525 ]; then
+            echo "⚠️  Driver $DRIVER_VERSION may not support CUDA 12.4 (need 525+)"
+        fi
+    fi
+else
+    echo "❌ nvidia-smi not found - no NVIDIA drivers"
+    exit 1
+fi
+
+GPU_NAME=$(detect_gpu_name)
+echo "🔍 Detected GPU: $GPU_NAME"
+
+VRAM_GB=$(get_gpu_vram)
+echo "🧠 Detected VRAM: ${VRAM_GB}GB"
+
+CUDA_ARCH=$(map_gpu_to_arch "$GPU_NAME")
+echo "🚀 Using CUDA architecture: $CUDA_ARCH"
+
+PROFILE=$(map_gpu_to_profile "$GPU_NAME" "$VRAM_GB")
+echo "⚙️  Selected profile: $PROFILE"
+
+docker build --build-arg CUDA_ARCHITECTURES="$CUDA_ARCH" -t deepbeepmeep/wan2gp .
+
+# sudo helper for later commands
+if [ "$EUID" -ne 0 ]; then
+    SUDO='sudo'
+else
+    SUDO=''
+fi
+
+# Ensure NVIDIA runtime is available.
+# docker info is captured first: piping it straight into `grep -q` under `set -o pipefail`
+# can report a failure when grep exits on the first match and docker info receives SIGPIPE.
+docker_info=$(docker info 2>/dev/null || true)
+if ! grep -q 'Runtimes:.*nvidia' <<<"$docker_info"; then
+    echo "⚠️  NVIDIA Docker runtime not found. Installing nvidia-container-toolkit…"
+    $SUDO apt-get update
+    $SUDO apt-get install -y curl ca-certificates gnupg
+    # Repository setup follows NVIDIA's Container Toolkit install guide: a dedicated keyring
+    # referenced through signed-by replaces the deprecated apt-key / nvidia-docker2 flow.
+    # curl -f makes an HTTP error fail the step instead of writing an error page to disk.
+    toolkit_keyring=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey |
+        $SUDO gpg --batch --yes --dearmor -o "$toolkit_keyring"
+    curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
+        sed "s#deb https://#deb [signed-by=$toolkit_keyring] https://#g" |
+        $SUDO tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+    $SUDO apt-get update
+    $SUDO apt-get install -y nvidia-container-toolkit
+    # Register the nvidia runtime in Docker's daemon configuration
+    $SUDO nvidia-ctk runtime configure --runtime=docker
+    echo "🔄 Restarting Docker service…"
+    $SUDO systemctl restart docker
+    echo "✅ NVIDIA Docker runtime installed."
+else
+    echo "✅ NVIDIA Docker runtime found."
+fi
+
+# Quick NVIDIA runtime test
+echo "🧪 Testing NVIDIA runtime..."
+if timeout 15s docker run --rm --gpus all --runtime=nvidia nvidia/cuda:12.4-runtime-ubuntu22.04 nvidia-smi >/dev/null 2>&1; then
+    echo "✅ NVIDIA runtime working"
+else
+    echo "❌ NVIDIA runtime test failed - check driver/runtime compatibility"
+fi
+
+# Prepare cache dirs & volume mounts
+cache_dirs=(numba matplotlib huggingface torch)
+cache_mounts=()
+for d in "${cache_dirs[@]}"; do
+    mkdir -p "$HOME/.cache/$d"
+    chmod 700 "$HOME/.cache/$d"
+    cache_mounts+=(-v "$HOME/.cache/$d:/home/user/.cache/$d")
+done
+
+echo "🔧 Optimization settings:"
+echo "   Profile: $PROFILE"
+
+# Run the container
+docker run --rm -it \
+    --name wan2gp \
+    --gpus all \
+    --runtime=nvidia \
+    -p 7860:7860 \
+    -v "$(pwd):/workspace" \
+    "${cache_mounts[@]}" \
+    deepbeepmeep/wan2gp \
+    --profile "$PROFILE" \
+    --attention sage \
+    --compile \
+    --perc-reserved-mem-max 1