DeepBeepMeep 2025-07-09 10:21:18 +02:00
parent 6a28bb8d4d
commit 8d2164aaf1
4 changed files with 138 additions and 26 deletions

View File

@@ -11,7 +11,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 - Very Fast on the latest GPUs
 - Easy to use Full Web based interface
 - Auto download of the required model adapted to your specific architecture
-- Tools integrated to facilitate Video Generation : Mask Editor, Prompt Enhancer, Temporal and Spatial Generation, MMAudio, Vew
+- Tools integrated to facilitate Video Generation : Mask Editor, Prompt Enhancer, Temporal and Spatial Generation, MMAudio, Video Browser, Pose / Depth / Flow extractor
 - Loras Support to customize each model
 - Queuing system : make your shopping list of videos to generate and come back later
@@ -27,7 +27,7 @@ Of course you will get as well *Multitalk* vanilla and also *Multitalk 720p* as
 And since I am mister nice guy I have enclosed as an exclusivity an *Audio Separator* that will save you time to isolate each voice when using Multitalk with two people.
-As I feel like a resting a bit I haven't produced a nice sample Video to illustrate all these new capabilities. But here is the thing, I ams sure you will publish in the *Share Your Best Video* channel your *Master Pieces*. The best one will be added to the *Announcements Channel* and will bring eternal fame to its author.
+As I feel like resting a bit I haven't yet produced a nice sample Video to illustrate all these new capabilities. But here is the thing, I am sure you will publish in the *Share Your Best Video* channel your *Master Pieces*. The best ones will be added to the *Announcements Channel* and will bring eternal fame to their authors.
 But wait, there is more:
 - Sliding Windows support has been added anywhere with Wan models, so imagine with text2video recently upgraded in 6.5 into a video2video, you can now upsample very long videos regardless of your VRAM. The good old image2video model can now reuse the last image to produce new videos (as requested by many of you)

View File

@@ -117,6 +117,20 @@ class PoseAnnotator:
         H, W, C = ori_img.shape
         with torch.no_grad():
             candidate, subset, det_result = self.pose_estimation(ori_img)
+            if len(candidate) == 0:
+                # No detections - return empty results
+                empty_ret_data = {}
+                if self.use_body:
+                    empty_ret_data["detected_map_body"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                if self.use_face:
+                    empty_ret_data["detected_map_face"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                if self.use_body and self.use_face:
+                    empty_ret_data["detected_map_bodyface"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                if self.use_hand and self.use_body and self.use_face:
+                    empty_ret_data["detected_map_handbodyface"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                return empty_ret_data, np.array([])
             nums, keys, locs = candidate.shape
             candidate[..., 0] /= float(W)
             candidate[..., 1] /= float(H)
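
Side note, not part of the commit: a minimal sketch of what the new fallback yields when `candidate` comes back empty, assuming `ori_h, ori_w` are taken from the original-shape argument that the video annotators pass as `frame.shape[:2]`:

    # hypothetical call, mirroring how the video annotators invoke process()
    ret_data, keypoints = annotator.process(resized_image, frame.shape[:2])
    # with use_body and use_face enabled and nothing detected:
    #   ret_data["detected_map_bodyface"].shape == (frame_h, frame_w, 3), all zeros
    #   keypoints is np.array([]) with size 0, so callers no longer crash on empty detections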
@@ -202,17 +216,17 @@ class PoseBodyFaceAnnotator(PoseAnnotator):
 class OptimizedPoseBodyFaceVideoAnnotator:
     """Optimized video annotator with multiple optimization strategies"""
-    def __init__(self, cfg, num_workers=5, chunk_size=8):
+    def __init__(self, cfg, num_workers=2, chunk_size=8):
         self.cfg = cfg
         self.num_workers = num_workers
         self.chunk_size = chunk_size
-        self.use_body, self.use_face, self.use_hand = True, True, False
+        self.use_body, self.use_face, self.use_hand = True, True, True
         # Initialize one annotator per worker to avoid ONNX session conflicts
         self.annotators = []
         for _ in range(num_workers):
             annotator = OptimizedPoseAnnotator(cfg)
-            annotator.use_body, annotator.use_face, annotator.use_hand = True, True, False
+            annotator.use_body, annotator.use_face, annotator.use_hand = True, True, True
             self.annotators.append(annotator)
         self._current_worker = 0
@@ -239,8 +253,8 @@ class OptimizedPoseBodyFaceVideoAnnotator:
             # Process
             ret_data, _ = annotator.process(resized_image, frame.shape[:2])
-            if 'detected_map_bodyface' in ret_data:
-                return frame_idx, ret_data['detected_map_bodyface']
+            if 'detected_map_handbodyface' in ret_data:
+                return frame_idx, ret_data['detected_map_handbodyface']
             else:
                 # Create empty frame if no detection
                 h, w = frame.shape[:2]
@@ -267,8 +281,8 @@ class OptimizedPoseBodyFaceVideoAnnotator:
                 resized_image = resize_image(input_image, annotator.resize_size)
                 ret_data, _ = annotator.process(resized_image, frame.shape[:2])
-                if 'detected_map_bodyface' in ret_data:
-                    ret_frames.append(ret_data['detected_map_bodyface'])
+                if 'detected_map_handbodyface' in ret_data:
+                    ret_frames.append(ret_data['detected_map_handbodyface'])
                 else:
                     h, w = frame.shape[:2]
                     ret_frames.append(np.zeros((h, w, 3), dtype=np.uint8))
@@ -293,12 +307,109 @@ class OptimizedPoseBodyFaceVideoAnnotator:
         return results
-# Alias for backward compatibility
-class PoseBodyFaceVideoAnnotator(OptimizedPoseBodyFaceVideoAnnotator):
-    """Backward compatible class name"""
-    def __init__(self, cfg, num_workers=2, chunk_size=8):
-        # Use optimized version with conservative settings
-        super().__init__(cfg, num_workers=num_workers, chunk_size=chunk_size)
+class OptimizedPoseBodyFaceHandVideoAnnotator:
+    """Optimized video annotator that includes hands, body, and face"""
+    def __init__(self, cfg, num_workers=2, chunk_size=8):
+        self.cfg = cfg
+        self.num_workers = num_workers
+        self.chunk_size = chunk_size
+        self.use_body, self.use_face, self.use_hand = True, True, True  # Enable hands
+        # Initialize one annotator per worker to avoid ONNX session conflicts
+        self.annotators = []
+        for _ in range(num_workers):
+            annotator = OptimizedPoseAnnotator(cfg)
+            annotator.use_body, annotator.use_face, annotator.use_hand = True, True, True
+            self.annotators.append(annotator)
+        self._current_worker = 0
+        self._worker_lock = threading.Lock()
+    def _get_annotator(self):
+        """Get next available annotator in round-robin fashion"""
+        with self._worker_lock:
+            annotator = self.annotators[self._current_worker]
+            self._current_worker = (self._current_worker + 1) % len(self.annotators)
+            return annotator
+    def _process_single_frame(self, frame_data):
+        """Process a single frame with error handling"""
+        frame, frame_idx = frame_data
+        try:
+            annotator = self._get_annotator()
+            # Convert frame
+            frame = convert_to_numpy(frame)
+            input_image = HWC3(frame[..., ::-1])
+            resized_image = resize_image(input_image, annotator.resize_size)
+            # Process
+            ret_data, _ = annotator.process(resized_image, frame.shape[:2])
+            if 'detected_map_handbodyface' in ret_data:
+                return frame_idx, ret_data['detected_map_handbodyface']
+            else:
+                # Create empty frame if no detection
+                h, w = frame.shape[:2]
+                return frame_idx, np.zeros((h, w, 3), dtype=np.uint8)
+        except Exception as e:
+            print(f"Error processing frame {frame_idx}: {e}")
+            # Return empty frame on error
+            h, w = frame.shape[:2] if hasattr(frame, 'shape') else (480, 640)
+            return frame_idx, np.zeros((h, w, 3), dtype=np.uint8)
+    def forward(self, frames):
+        """Process video frames with optimizations"""
+        if len(frames) == 0:
+            return []
+        # For small number of frames, use serial processing to avoid threading overhead
+        if len(frames) <= 4:
+            annotator = self.annotators[0]
+            ret_frames = []
+            for frame in frames:
+                frame = convert_to_numpy(frame)
+                input_image = HWC3(frame[..., ::-1])
+                resized_image = resize_image(input_image, annotator.resize_size)
+                ret_data, _ = annotator.process(resized_image, frame.shape[:2])
+                if 'detected_map_handbodyface' in ret_data:
+                    ret_frames.append(ret_data['detected_map_handbodyface'])
+                else:
+                    h, w = frame.shape[:2]
+                    ret_frames.append(np.zeros((h, w, 3), dtype=np.uint8))
+            return ret_frames
+        # For larger videos, use parallel processing
+        frame_data = [(frame, idx) for idx, frame in enumerate(frames)]
+        results = [None] * len(frames)
+        # Process in chunks to manage memory
+        for chunk_start in range(0, len(frame_data), self.chunk_size * self.num_workers):
+            chunk_end = min(chunk_start + self.chunk_size * self.num_workers, len(frame_data))
+            chunk_data = frame_data[chunk_start:chunk_end]
+            with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+                chunk_results = list(executor.map(self._process_single_frame, chunk_data))
+            # Store results in correct order
+            for frame_idx, result in chunk_results:
+                results[frame_idx] = result
+        return results
+# Choose which version you want to use:
+# Option 1: Body + Face only (original behavior)
+class PoseBodyFaceVideoAnnotator(OptimizedPoseBodyFaceVideoAnnotator):
+    """Backward compatible class name - Body and Face only"""
+# Option 2: Body + Face + Hands (if you want hands)
+class PoseBodyFaceHandVideoAnnotator(OptimizedPoseBodyFaceHandVideoAnnotator):
+    """Video annotator with hands, body, and face"""
+    def __init__(self, cfg):
+        super().__init__(cfg, num_workers=2, chunk_size=4)
 # Keep the existing utility functions
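
For orientation, a minimal usage sketch of the two variants defined above; it is not part of the commit, and the `cfg` contents and dummy frames are assumptions (the real `cfg` must provide whatever `OptimizedPoseAnnotator` expects, such as the ONNX detection and pose model settings):

    import numpy as np

    cfg = {}  # assumed: filled with the model settings OptimizedPoseAnnotator expects

    body_face = PoseBodyFaceVideoAnnotator(cfg)           # body + face only, original behavior
    body_face_hand = PoseBodyFaceHandVideoAnnotator(cfg)  # adds hands, num_workers=2, chunk_size=4

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(8)]  # stand-in frames
    pose_maps = body_face_hand.forward(frames)  # list of HxWx3 uint8 pose maps, one per frame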

View File

@@ -148,7 +148,8 @@ class OptimizedPyannote31SpeakerSeparator:
         both_speaking_mask = np.zeros(audio_length, dtype=bool)
         # ── 1) try the proper overlap model ────────────────────────────────
-        overlap_pipeline = self._get_overlap_pipeline()
+        # overlap_pipeline = self._get_overlap_pipeline() # doesnt work anyway
+        overlap_pipeline = None
         # try the path stored by separate_audio otherwise whatever the
         # diarization object carries (may be None)

wgp.py (22 changes)
View File

@@ -3295,7 +3295,7 @@ def get_preprocessor(process_type, inpaint_color):
 def process_images_multithread(image_processor, items, process_type, wrap_in_list = True, max_workers: int = os.cpu_count()/ 2) :
     if not items:
         return []
+    max_workers = 11
     import concurrent.futures
     start_time = time.time()
     # print(f"Preprocessus:{process_type} started")
@@ -3910,6 +3910,7 @@ def generate_video(
     prompts = prompt.split("\n")
     prompts = [part for part in prompts if len(prompt)>0]
+    parsed_keep_frames_video_source= max_source_video_frames if len(keep_frames_video_source) ==0 else int(keep_frames_video_source)
     loras = state["loras"]
@@ -4026,13 +4027,13 @@ def generate_video(
         frames_to_inject = [None] * (max(frames_positions_list) + 1)
         for i, pos in enumerate(frames_positions_list):
             frames_to_inject[pos] = image_refs[i]
-        if video_guide == None and video_source == None and not "L" in image_prompt_type:
+        if video_guide == None and video_source == None and not "L" in image_prompt_type and (nb_frames_positions > 0 or "K" in video_prompt_type) :
            from wan.utils.utils import resize_lanczos, calculate_new_dimensions, get_outpainting_full_area_dimensions
            w, h = image_refs[0].size
            if outpainting_dims != None:
                h, w = get_outpainting_full_area_dimensions(h,w, outpainting_dims)
            default_image_size = calculate_new_dimensions(height, width, h, w, fit_canvas)
            fit_canvas = None
         if len(image_refs) > nb_frames_positions:
            if hunyuan_avatar: remove_background_images_ref = 0
            any_background_ref = "K" in video_prompt_type
@@ -4267,8 +4268,7 @@ def generate_video(
                 from wan.utils.utils import get_video_frame
                 refresh_preview["video_source"] = get_video_frame(video_source, 0)
             if video_source != None and len(video_source) > 0 and window_no == 1:
-                keep_frames_video_source= max_source_video_frames if len(keep_frames_video_source) ==0 else int(keep_frames_video_source)
-                prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, target_fps = fps, block_size = 32 if ltxv else 16)
+                prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, target_fps = fps, block_size = 32 if ltxv else 16)
                 prefix_video = prefix_video.permute(3, 0, 1, 2)
                 prefix_video = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w
                 pre_video_guide = prefix_video[:, -reuse_frames:]
@@ -4629,7 +4629,7 @@ def generate_video(
             send_cmd("output")
-            seed = set_seed(seed)
+            seed = set_seed(-1)
         clear_status(state)
         offload.unload_loras_from_model(trans)
         if len(control_audio_tracks) > 0: