mirror of
				https://github.com/Wan-Video/Wan2.1.git
				synced 2025-11-04 06:15:17 +00:00 
			
		
		
		
	add more checks + logging + upgrade peft
This commit is contained in:
		
							parent
							
								
									dedcc577a4
								
							
						
					
					
						commit
						3307defa7c
					
				
							
								
								
									
										100
									
								
								entrypoint.sh
									
									
									
									
									
								
							
							
						
						
									
										100
									
								
								entrypoint.sh
									
									
									
									
									
								
							@ -15,4 +15,104 @@ export TORCH_ALLOW_TF32_CUDNN=1
 | 
				
			|||||||
export SDL_AUDIODRIVER=dummy
 | 
					export SDL_AUDIODRIVER=dummy
 | 
				
			||||||
export PULSE_RUNTIME_PATH=/tmp/pulse-runtime
 | 
					export PULSE_RUNTIME_PATH=/tmp/pulse-runtime
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ═══════════════════════════ CUDA DEBUG CHECKS ═══════════════════════════
					# Startup diagnostics printed before the application is launched.
					# Every check here is best-effort: a failed probe prints a hint and the
					# script carries on, so a missing GPU never blocks startup by itself.
					# Only POSIX sh constructs are used in this section.

					echo "🔍 CUDA Environment Debug Information:"
					echo "═══════════════════════════════════════════════════════════════════════"

					# Check CUDA driver on host (if accessible)
					if command -v nvidia-smi >/dev/null 2>&1; then
					    echo "✅ nvidia-smi available"
					    echo "📊 GPU Information:"
					    nvidia-smi --query-gpu=name,driver_version,memory.total,memory.free --format=csv,noheader,nounits 2>/dev/null || echo "❌ nvidia-smi failed to query GPU"
					    echo "🏃 Running Processes:"
					    # A successful query with no compute processes normally yields empty
					    # output, so decide on the captured text rather than on the exit status
					    # alone; otherwise the "no processes" note would only show on errors.
					    cuda_apps=$(nvidia-smi --query-compute-apps=pid,name,used_memory --format=csv,noheader,nounits 2>/dev/null) || cuda_apps=""
					    if [ -n "$cuda_apps" ]; then
					        printf '%s\n' "$cuda_apps"
					    else
					        echo "ℹ️  No running CUDA processes"
					    fi
					else
					    echo "❌ nvidia-smi not available in container"
					fi

					# Check CUDA runtime libraries
					echo ""
					echo "🔧 CUDA Runtime Check:"
					# List once and reuse the result; output is discarded if ls reports failure.
					cudart_libs=$(ls /usr/local/cuda*/lib*/libcudart.so* 2>/dev/null) || cudart_libs=""
					if [ -n "$cudart_libs" ]; then
					    echo "✅ CUDA runtime libraries found:"
					    printf '%s\n' "$cudart_libs"
					else
					    echo "❌ CUDA runtime libraries not found"
					fi

					# Check CUDA devices
					echo ""
					echo "🖥️  CUDA Device Files:"
					# The long listing is kept for the permission diagnostic further down.
					nvidia_dev_listing=$(ls -la /dev/nvidia* 2>/dev/null) || nvidia_dev_listing=""
					if [ -n "$nvidia_dev_listing" ]; then
					    echo "✅ NVIDIA device files found:"
					    printf '%s\n' "$nvidia_dev_listing"
					else
					    echo "❌ No NVIDIA device files found - Docker may not have GPU access"
					fi

					# Check CUDA environment variables
					echo ""
					echo "🌍 CUDA Environment Variables:"
					echo "   CUDA_HOME: ${CUDA_HOME:-not set}"
					echo "   CUDA_ROOT: ${CUDA_ROOT:-not set}"
					echo "   CUDA_PATH: ${CUDA_PATH:-not set}"
					echo "   LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-not set}"
					echo "   TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-not set}"
					echo "   CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-not set}"

					# Check PyTorch CUDA availability
					echo ""
					echo "🐍 PyTorch CUDA Check:"
					# The quoted delimiter keeps the shell from expanding anything in the
					# Python source; "<<-" strips the leading tabs so Python sees the code
					# starting at column 0 (nested levels are indented with spaces).
					python3 - 2>&1 <<-'PY'
					import sys
					try:
					    import torch
					    print('✅ PyTorch imported successfully')
					    print(f'   Version: {torch.__version__}')
					    print(f'   CUDA available: {torch.cuda.is_available()}')
					    if torch.cuda.is_available():
					        print(f'   CUDA version: {torch.version.cuda}')
					        print(f'   cuDNN version: {torch.backends.cudnn.version()}')
					        print(f'   Device count: {torch.cuda.device_count()}')
					        for i in range(torch.cuda.device_count()):
					            props = torch.cuda.get_device_properties(i)
					            print(f'   Device {i}: {props.name} (SM {props.major}.{props.minor}, {props.total_memory//1024//1024}MB)')
					    else:
					        print('❌ CUDA not available to PyTorch')
					        print('   This could mean:')
					        print('   - CUDA runtime not properly installed')
					        print('   - GPU not accessible to container')
					        print('   - Driver/runtime version mismatch')
					except ImportError as e:
					    print(f'❌ Failed to import PyTorch: {e}')
					except Exception as e:
					    print(f'❌ PyTorch CUDA check failed: {e}')
					PY

					# Check for common CUDA issues
					echo ""
					echo "🩺 Common Issue Diagnostics:"

					# Check if running with proper Docker flags
					if [ ! -e /dev/nvidia0 ] && [ ! -e /dev/nvidiactl ]; then
					    echo "❌ No NVIDIA device nodes - container likely missing --gpus all or --runtime=nvidia"
					fi

					# Check CUDA library paths
					# ${VAR:-} keeps this safe when the variable is unset and nounset is on.
					case "${LD_LIBRARY_PATH:-}" in
					    *cuda*) ;;
					    *) echo "⚠️  LD_LIBRARY_PATH may not include CUDA libraries" ;;
					esac

					# Check permissions on device files
					if [ -n "$nvidia_dev_listing" ]; then
					    # Warn when no listed entry shows one of the two permissive mode strings.
					    if ! printf '%s\n' "$nvidia_dev_listing" | grep -Eq 'rw-rw-rw-|rw-r--r--'; then
					        echo "⚠️  NVIDIA device files may have restrictive permissions"
					    fi
					fi

					echo "═══════════════════════════════════════════════════════════════════════"
					echo "🚀 Starting application..."
					echo ""

 | 
				
			||||||
exec su -p user -c "python3 wgp.py --listen $*"
 | 
					exec su -p user -c "python3 wgp.py --listen $*"
 | 
				
			||||||
 | 
				
			|||||||
@ -12,13 +12,13 @@ easydict
 | 
				
			|||||||
ftfy
 | 
					ftfy
 | 
				
			||||||
dashscope
 | 
					dashscope
 | 
				
			||||||
imageio-ffmpeg
 | 
					imageio-ffmpeg
 | 
				
			||||||
# flash_attn    
 | 
					# flash_attn
 | 
				
			||||||
gradio==5.23.0   
 | 
					gradio==5.23.0
 | 
				
			||||||
numpy>=1.23.5,<2
 | 
					numpy>=1.23.5,<2
 | 
				
			||||||
einops
 | 
					einops
 | 
				
			||||||
moviepy==1.0.3
 | 
					moviepy==1.0.3
 | 
				
			||||||
mmgp==3.5.1
 | 
					mmgp==3.5.1
 | 
				
			||||||
peft==0.15.0
 | 
					peft==0.17.0
 | 
				
			||||||
mutagen
 | 
					mutagen
 | 
				
			||||||
pydantic==2.10.6
 | 
					pydantic==2.10.6
 | 
				
			||||||
decord
 | 
					decord
 | 
				
			||||||
@ -46,4 +46,4 @@ soundfile
 | 
				
			|||||||
ffmpeg-python
 | 
					ffmpeg-python
 | 
				
			||||||
pyannote.audio
 | 
					pyannote.audio
 | 
				
			||||||
# num2words
 | 
					# num2words
 | 
				
			||||||
# spacy
 | 
					# spacy
 | 
				
			||||||
 | 
				
			|||||||
@ -114,6 +114,25 @@ map_gpu_to_profile() {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# ───────────────────────── main ────────────────────────────
 | 
					# ───────────────────────── main ────────────────────────────
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					echo "🔧 NVIDIA CUDA Setup Check:"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# NVIDIA driver check
					# Aborts with status 1 when nvidia-smi is not installed. Otherwise reports
					# the driver version and warns when its major number is below the minimum
					# NVIDIA documents for CUDA 12.x containers.
					# Sets the globals DRIVER_VERSION and, when the version parses, MAJOR.
					if command -v nvidia-smi &>/dev/null; then
					    # An empty result means the query itself failed (e.g. driver not loaded).
					    DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits | head -n 1) || DRIVER_VERSION=""
					    if [[ -n "$DRIVER_VERSION" ]]; then
					        echo "✅ NVIDIA Driver: $DRIVER_VERSION"
					    else
					        echo "⚠️  nvidia-smi is installed but did not report a driver version"
					    fi

					    # Quick CUDA 12.4 compatibility check
					    # NVIDIA's compatibility table lists 525.60.13 as the minimum Linux
					    # driver for CUDA 12.x (minor-version compatibility) and 550.54.14 for
					    # full CUDA 12.4 support, so only the major number is compared here.
					    MIN_DRIVER_MAJOR=525
					    if [[ "$DRIVER_VERSION" =~ ^([0-9]+) ]]; then
					        MAJOR=${BASH_REMATCH[1]}
					        if [ "$MAJOR" -lt "$MIN_DRIVER_MAJOR" ]; then
					            echo "⚠️  Driver $DRIVER_VERSION may not support CUDA 12.4 (need ${MIN_DRIVER_MAJOR}+, 550+ recommended)"
					        fi
					    fi
					else
					    echo "❌ nvidia-smi not found - no NVIDIA drivers"
					    exit 1
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
GPU_NAME=$(detect_gpu_name)
 | 
					GPU_NAME=$(detect_gpu_name)
 | 
				
			||||||
echo "🔍 Detected GPU: $GPU_NAME"
 | 
					echo "🔍 Detected GPU: $GPU_NAME"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -156,6 +175,14 @@ else
 | 
				
			|||||||
    echo "✅ NVIDIA Docker runtime found."
 | 
					    echo "✅ NVIDIA Docker runtime found."
 | 
				
			||||||
fi
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Quick NVIDIA runtime test
					# Runs nvidia-smi inside a throwaway CUDA container to confirm that Docker
					# can hand GPUs to containers. The result is informational only: a failure
					# prints a hint and the script continues.
					# CUDA_TEST_IMAGE may be set by the caller to use a different image.
					# Published nvidia/cuda tags carry a patch version ("12.4.1-…"); a bare
					# "12.4-…" tag cannot be pulled, which made this probe fail every time.
					CUDA_TEST_IMAGE="${CUDA_TEST_IMAGE:-nvidia/cuda:12.4.1-runtime-ubuntu22.04}"
					echo "🧪 Testing NVIDIA runtime..."
					# Fetch the image up front (bounded separately) so the short timeout
					# below measures the GPU probe rather than the download.
					if ! docker image inspect "$CUDA_TEST_IMAGE" >/dev/null 2>&1; then
					    timeout 300s docker pull --quiet "$CUDA_TEST_IMAGE" >/dev/null 2>&1 || true
					fi
					if timeout 15s docker run --rm --gpus all --runtime=nvidia "$CUDA_TEST_IMAGE" nvidia-smi >/dev/null 2>&1; then
					    echo "✅ NVIDIA runtime working"
					else
					    echo "❌ NVIDIA runtime test failed - check driver/runtime compatibility"
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Prepare cache dirs & volume mounts
 | 
					# Prepare cache dirs & volume mounts
 | 
				
			||||||
cache_dirs=(numba matplotlib huggingface torch)
 | 
					cache_dirs=(numba matplotlib huggingface torch)
 | 
				
			||||||
cache_mounts=()
 | 
					cache_mounts=()
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user