mirror of https://github.com/Wan-Video/Wan2.1.git
synced 2025-11-04 14:16:57 +00:00

add more checks + logging + upgrade peft

This commit is contained in:
parent dedcc577a4
commit 3307defa7c

entrypoint.sh
@@ -15,4 +15,104 @@ export TORCH_ALLOW_TF32_CUDNN=1
 export SDL_AUDIODRIVER=dummy
 export PULSE_RUNTIME_PATH=/tmp/pulse-runtime
 
+# ═══════════════════════════ CUDA DEBUG CHECKS ═══════════════════════════
+echo "🔍 CUDA Environment Debug Information:"
+echo "═══════════════════════════════════════════════════════════════════════"
+
+# Check CUDA driver on host (if accessible)
+if command -v nvidia-smi >/dev/null 2>&1; then
+    echo "✅ nvidia-smi available"
+    echo "📊 GPU Information:"
+    nvidia-smi --query-gpu=name,driver_version,memory.total,memory.free --format=csv,noheader,nounits 2>/dev/null || echo "❌ nvidia-smi failed to query GPU"
+    echo "🏃 Running Processes:"
+    nvidia-smi --query-compute-apps=pid,name,used_memory --format=csv,noheader,nounits 2>/dev/null || echo "ℹ️ No running CUDA processes"
+else
+    echo "❌ nvidia-smi not available in container"
+fi
+
+# Check CUDA runtime libraries
+echo ""
+echo "🔧 CUDA Runtime Check:"
+if ls /usr/local/cuda*/lib*/libcudart.so* >/dev/null 2>&1; then
+    echo "✅ CUDA runtime libraries found:"
+    ls /usr/local/cuda*/lib*/libcudart.so* 2>/dev/null
+else
+    echo "❌ CUDA runtime libraries not found"
+fi
+
+# Check CUDA devices
+echo ""
+echo "🖥️ CUDA Device Files:"
+if ls /dev/nvidia* >/dev/null 2>&1; then
+    echo "✅ NVIDIA device files found:"
+    ls -la /dev/nvidia* 2>/dev/null
+else
+    echo "❌ No NVIDIA device files found - Docker may not have GPU access"
+fi
+
+# Check CUDA environment variables
+echo ""
+echo "🌍 CUDA Environment Variables:"
+echo "  CUDA_HOME: ${CUDA_HOME:-not set}"
+echo "  CUDA_ROOT: ${CUDA_ROOT:-not set}"
+echo "  CUDA_PATH: ${CUDA_PATH:-not set}"
+echo "  LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-not set}"
+echo "  TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-not set}"
+echo "  CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-not set}"
+
+# Check PyTorch CUDA availability
+echo ""
+echo "🐍 PyTorch CUDA Check:"
+python3 -c "
+import sys
+try:
+    import torch
+    print('✅ PyTorch imported successfully')
+    print(f'  Version: {torch.__version__}')
+    print(f'  CUDA available: {torch.cuda.is_available()}')
+    if torch.cuda.is_available():
+        print(f'  CUDA version: {torch.version.cuda}')
+        print(f'  cuDNN version: {torch.backends.cudnn.version()}')
+        print(f'  Device count: {torch.cuda.device_count()}')
+        for i in range(torch.cuda.device_count()):
+            props = torch.cuda.get_device_properties(i)
+            print(f'  Device {i}: {props.name} (SM {props.major}.{props.minor}, {props.total_memory//1024//1024}MB)')
+    else:
+        print('❌ CUDA not available to PyTorch')
+        print('  This could mean:')
+        print('  - CUDA runtime not properly installed')
+        print('  - GPU not accessible to container')
+        print('  - Driver/runtime version mismatch')
+except ImportError as e:
+    print(f'❌ Failed to import PyTorch: {e}')
+except Exception as e:
+    print(f'❌ PyTorch CUDA check failed: {e}')
+" 2>&1
+
+# Check for common CUDA issues
+echo ""
+echo "🩺 Common Issue Diagnostics:"
+
+# Check if running with proper Docker flags
+if [ ! -e /dev/nvidia0 ] && [ ! -e /dev/nvidiactl ]; then
+    echo "❌ No NVIDIA device nodes - container likely missing --gpus all or --runtime=nvidia"
+fi
+
+# Check CUDA library paths
+if [ -z "$LD_LIBRARY_PATH" ] || ! echo "$LD_LIBRARY_PATH" | grep -q cuda; then
+    echo "⚠️ LD_LIBRARY_PATH may not include CUDA libraries"
+fi
+
+# Check permissions on device files
+if ls /dev/nvidia* >/dev/null 2>&1; then
+    if ! ls -la /dev/nvidia* | grep -q "rw-rw-rw-\|rw-r--r--"; then
+        echo "⚠️ NVIDIA device files may have restrictive permissions"
+    fi
+fi
+
+echo "═══════════════════════════════════════════════════════════════════════"
+echo "🚀 Starting application..."
+echo ""
+
 exec su -p user -c "python3 wgp.py --listen $*"
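These checks only pass when the container is actually given the GPU. A minimal launch sketch for local debugging; the image tag "wan2gp:latest" is a placeholder and is not defined by this commit:

# Sketch only: substitute your own build of this image.
# --gpus all is what exposes the /dev/nvidia* device nodes and driver libraries
# that the device-node and nvidia-smi checks in the entrypoint look for.
docker run --rm --gpus all wan2gp:latest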
@@ -18,7 +18,7 @@ numpy>=1.23.5,<2
 einops
 moviepy==1.0.3
 mmgp==3.5.1
-peft==0.15.0
+peft==0.17.0
 mutagen
 pydantic==2.10.6
 decord
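A quick way to confirm the peft bump actually landed in the built environment; run inside the container, this snippet is a verification sketch and not part of the commit:

python3 - <<'PY'
# Fails loudly if the environment still carries the old pin.
import peft
assert peft.__version__ == "0.17.0", f"unexpected peft version: {peft.__version__}"
print("peft", peft.__version__)
PY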
@@ -114,6 +114,25 @@ map_gpu_to_profile() {
 
 # ───────────────────────── main ────────────────────────────
 
+echo "🔧 NVIDIA CUDA Setup Check:"
+
+# NVIDIA driver check
+if command -v nvidia-smi &>/dev/null; then
+    DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | head -1)
+    echo "✅ NVIDIA Driver: $DRIVER_VERSION"
+
+    # Quick CUDA 12.4 compatibility check
+    if [[ "$DRIVER_VERSION" =~ ^([0-9]+) ]]; then
+        MAJOR=${BASH_REMATCH[1]}
+        if [ "$MAJOR" -lt 520 ]; then
+            echo "⚠️ Driver $DRIVER_VERSION may not support CUDA 12.4 (need 520+)"
+        fi
+    fi
+else
+    echo "❌ nvidia-smi not found - no NVIDIA drivers"
+    exit 1
+fi
+
 GPU_NAME=$(detect_gpu_name)
 echo "🔍 Detected GPU: $GPU_NAME"
 
@@ -156,6 +175,14 @@ else
     echo "✅ NVIDIA Docker runtime found."
 fi
 
+# Quick NVIDIA runtime test
+echo "🧪 Testing NVIDIA runtime..."
+if timeout 15s docker run --rm --gpus all --runtime=nvidia nvidia/cuda:12.4.1-runtime-ubuntu22.04 nvidia-smi >/dev/null 2>&1; then
+    echo "✅ NVIDIA runtime working"
+else
+    echo "❌ NVIDIA runtime test failed - check driver/runtime compatibility"
+fi
+
 # Prepare cache dirs & volume mounts
 cache_dirs=(numba matplotlib huggingface torch)
 cache_mounts=()
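Because the scripted probe discards all output, a failure only reports a single line. Re-running the same command by hand, without the timeout or redirection, surfaces the underlying error. A debugging sketch, not part of the commit:

# Uses the same CUDA runtime image as the probe above.
docker run --rm --gpus all --runtime=nvidia \
  nvidia/cuda:12.4.1-runtime-ubuntu22.04 nvidia-smi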