diff --git a/entrypoint.sh b/entrypoint.sh
index 4c363ab..9af052d 100755
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -15,4 +15,104 @@ export TORCH_ALLOW_TF32_CUDNN=1
 export SDL_AUDIODRIVER=dummy
 export PULSE_RUNTIME_PATH=/tmp/pulse-runtime
 
+# ═══════════════════════════ CUDA DEBUG CHECKS ═══════════════════════════
+
+echo "🔍 CUDA Environment Debug Information:"
+echo "═══════════════════════════════════════════════════════════════════════"
+
+# Check CUDA driver on host (if accessible)
+if command -v nvidia-smi >/dev/null 2>&1; then
+    echo "✅ nvidia-smi available"
+    echo "📊 GPU Information:"
+    nvidia-smi --query-gpu=name,driver_version,memory.total,memory.free --format=csv,noheader,nounits 2>/dev/null || echo "❌ nvidia-smi failed to query GPU"
+    echo "🏃 Running Processes:"
+    nvidia-smi --query-compute-apps=pid,process_name,used_gpu_memory --format=csv,noheader,nounits 2>/dev/null || echo "ℹ️ No running CUDA processes"
+else
+    echo "❌ nvidia-smi not available in container"
+fi
+
+# Check CUDA runtime libraries
+echo ""
+echo "🔧 CUDA Runtime Check:"
+if ls /usr/local/cuda*/lib*/libcudart.so* >/dev/null 2>&1; then
+    echo "✅ CUDA runtime libraries found:"
+    ls /usr/local/cuda*/lib*/libcudart.so* 2>/dev/null
+else
+    echo "❌ CUDA runtime libraries not found"
+fi
+
+# Check CUDA devices
+echo ""
+echo "🖥️ CUDA Device Files:"
+if ls /dev/nvidia* >/dev/null 2>&1; then
+    echo "✅ NVIDIA device files found:"
+    ls -la /dev/nvidia* 2>/dev/null
+else
+    echo "❌ No NVIDIA device files found - Docker may not have GPU access"
+fi
+
+# Check CUDA environment variables
+echo ""
+echo "🌍 CUDA Environment Variables:"
+echo " CUDA_HOME: ${CUDA_HOME:-not set}"
+echo " CUDA_ROOT: ${CUDA_ROOT:-not set}"
+echo " CUDA_PATH: ${CUDA_PATH:-not set}"
+echo " LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-not set}"
+echo " TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-not set}"
+echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-not set}"
+
+# Check PyTorch CUDA availability
+echo ""
+echo "🐍 PyTorch CUDA Check:"
+python3 -c "
+import sys
+try:
+    import torch
+    print('✅ PyTorch imported successfully')
+    print(f' Version: {torch.__version__}')
+    print(f' CUDA available: {torch.cuda.is_available()}')
+    if torch.cuda.is_available():
+        print(f' CUDA version: {torch.version.cuda}')
+        print(f' cuDNN version: {torch.backends.cudnn.version()}')
+        print(f' Device count: {torch.cuda.device_count()}')
+        for i in range(torch.cuda.device_count()):
+            props = torch.cuda.get_device_properties(i)
+            print(f' Device {i}: {props.name} (SM {props.major}.{props.minor}, {props.total_memory//1024//1024}MB)')
+    else:
+        print('❌ CUDA not available to PyTorch')
+        print(' This could mean:')
+        print(' - CUDA runtime not properly installed')
+        print(' - GPU not accessible to container')
+        print(' - Driver/runtime version mismatch')
+except ImportError as e:
+    print(f'❌ Failed to import PyTorch: {e}')
+except Exception as e:
+    print(f'❌ PyTorch CUDA check failed: {e}')
+" 2>&1
+
+# Check for common CUDA issues
+echo ""
+echo "🩺 Common Issue Diagnostics:"
+
+# Check if running with proper Docker flags
+if [ ! -e /dev/nvidia0 ] && [ ! -e /dev/nvidiactl ]; then
+    echo "❌ No NVIDIA device nodes - container likely missing --gpus all or --runtime=nvidia"
+fi
+
+# Check CUDA library paths
+if [ -z "$LD_LIBRARY_PATH" ] || ! echo "$LD_LIBRARY_PATH" | grep -q cuda; then
+    echo "⚠️ LD_LIBRARY_PATH may not include CUDA libraries"
+fi
+
+# Check permissions on device files
+if ls /dev/nvidia* >/dev/null 2>&1; then
+    if ! ls -la /dev/nvidia* | grep -q "rw-rw-rw-\|rw-r--r--"; then
+        echo "⚠️ NVIDIA device files may have restrictive permissions"
+    fi
+fi
+
+echo "═══════════════════════════════════════════════════════════════════════"
+echo "🚀 Starting application..."
+echo ""
+
 exec su -p user -c "python3 wgp.py --listen $*"
diff --git a/requirements.txt b/requirements.txt
index ea0c582..312dc94 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,13 +12,13 @@ easydict
 ftfy
 dashscope
 imageio-ffmpeg
-# flash_attn
-gradio==5.23.0
+# flash_attn
+gradio==5.23.0
 numpy>=1.23.5,<2
 einops
 moviepy==1.0.3
 mmgp==3.5.1
-peft==0.15.0
+peft==0.17.0
 mutagen
 pydantic==2.10.6
 decord
@@ -46,4 +46,4 @@ soundfile
 ffmpeg-python
 pyannote.audio
 # num2words
-# spacy
\ No newline at end of file
+# spacy
diff --git a/run-docker-cuda-deb.sh b/run-docker-cuda-deb.sh
index 1a6201a..b35e9cc 100755
--- a/run-docker-cuda-deb.sh
+++ b/run-docker-cuda-deb.sh
@@ -114,6 +114,25 @@ map_gpu_to_profile() {
 
 # ───────────────────────── main ────────────────────────────
 
+echo "🔧 NVIDIA CUDA Setup Check:"
+
+# NVIDIA driver check
+if command -v nvidia-smi &>/dev/null; then
+    DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | head -1)
+    echo "✅ NVIDIA Driver: $DRIVER_VERSION"
+
+    # Quick CUDA 12.4 compatibility check
+    if [[ "$DRIVER_VERSION" =~ ^([0-9]+) ]]; then
+        MAJOR=${BASH_REMATCH[1]}
+        if [ "$MAJOR" -lt 525 ]; then
+            echo "⚠️ Driver $DRIVER_VERSION may not support CUDA 12.4 (need 525+)"
+        fi
+    fi
+else
+    echo "❌ nvidia-smi not found - no NVIDIA drivers"
+    exit 1
+fi
+
 GPU_NAME=$(detect_gpu_name)
 echo "🔍 Detected GPU: $GPU_NAME"
 
@@ -156,6 +175,14 @@
 else
     echo "✅ NVIDIA Docker runtime found."
 fi
+# Quick NVIDIA runtime test
+echo "🧪 Testing NVIDIA runtime..."
+if timeout 15s docker run --rm --gpus all --runtime=nvidia nvidia/cuda:12.4.1-runtime-ubuntu22.04 nvidia-smi >/dev/null 2>&1; then
+    echo "✅ NVIDIA runtime working"
+else
+    echo "❌ NVIDIA runtime test failed - check driver/runtime compatibility"
+fi
+
 # Prepare cache dirs & volume mounts
 cache_dirs=(numba matplotlib huggingface torch)
 cache_mounts=()