DeepBeepMeep 2025-07-09 10:21:18 +02:00
parent 6a28bb8d4d
commit 8d2164aaf1
4 changed files with 138 additions and 26 deletions

View File

@@ -11,7 +11,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 - Very Fast on the latest GPUs
 - Easy to use Full Web based interface
 - Auto download of the required model adapted to your specific architecture
-- Tools integrated to facilitate Video Generation : Mask Editor, Prompt Enhancer, Temporal and Spatial Generation, MMAudio, Vew
+- Tools integrated to facilitate Video Generation : Mask Editor, Prompt Enhancer, Temporal and Spatial Generation, MMAudio, Video Browser, Pose / Depth / Flow extractor
 - Loras Support to customize each model
 - Queuing system : make your shopping list of videos to generate and come back later
@@ -27,7 +27,7 @@ Of course you will get as well *Multitalk* vanilla and also *Multitalk 720p* as
 And since I am mister nice guy I have enclosed as an exclusivity an *Audio Separator* that will save you time to isolate each voice when using Multitalk with two people.
-As I feel like a resting a bit I haven't produced a nice sample Video to illustrate all these new capabilities. But here is the thing, I ams sure you will publish in the *Share Your Best Video* channel your *Master Pieces*. The best one will be added to the *Announcements Channel* and will bring eternal fame to its author.
+As I feel like resting a bit I haven't yet produced a nice sample Video to illustrate all these new capabilities. But here is the thing, I am sure you will publish in the *Share Your Best Video* channel your *Master Pieces*. The best ones will be added to the *Announcements Channel* and will bring eternal fame to their authors.
 But wait, there is more:
 - Sliding Windows support has been added anywhere with Wan models, so imagine with text2video recently upgraded in 6.5 into a video2video, you can now upsample very long videos regardless of your VRAM. The good old image2video model can now reuse the last image to produce new videos (as requested by many of you)

View File

@@ -117,6 +117,20 @@ class PoseAnnotator:
         H, W, C = ori_img.shape
         with torch.no_grad():
             candidate, subset, det_result = self.pose_estimation(ori_img)
+            if len(candidate) == 0:
+                # No detections - return empty results
+                empty_ret_data = {}
+                if self.use_body:
+                    empty_ret_data["detected_map_body"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                if self.use_face:
+                    empty_ret_data["detected_map_face"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                if self.use_body and self.use_face:
+                    empty_ret_data["detected_map_bodyface"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                if self.use_hand and self.use_body and self.use_face:
+                    empty_ret_data["detected_map_handbodyface"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                return empty_ret_data, np.array([])
             nums, keys, locs = candidate.shape
             candidate[..., 0] /= float(W)
             candidate[..., 1] /= float(H)
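
Side note, not part of the commit: a minimal sketch of what the new fallback yields when `candidate` comes back empty, assuming `ori_h, ori_w` are taken from the original-shape argument that the video annotators pass as `frame.shape[:2]`:

    # hypothetical call, mirroring how the video annotators invoke process()
    ret_data, keypoints = annotator.process(resized_image, frame.shape[:2])
    # with use_body and use_face enabled and nothing detected:
    #   ret_data["detected_map_bodyface"].shape == (frame_h, frame_w, 3), all zeros
    #   keypoints is np.array([]) with size 0, so callers no longer crash on empty detections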
@@ -202,17 +216,17 @@ class PoseBodyFaceAnnotator(PoseAnnotator):
 class OptimizedPoseBodyFaceVideoAnnotator:
     """Optimized video annotator with multiple optimization strategies"""
-    def __init__(self, cfg, num_workers=5, chunk_size=8):
+    def __init__(self, cfg, num_workers=2, chunk_size=8):
         self.cfg = cfg
         self.num_workers = num_workers
         self.chunk_size = chunk_size
-        self.use_body, self.use_face, self.use_hand = True, True, False
+        self.use_body, self.use_face, self.use_hand = True, True, True
         # Initialize one annotator per worker to avoid ONNX session conflicts
         self.annotators = []
         for _ in range(num_workers):
             annotator = OptimizedPoseAnnotator(cfg)
-            annotator.use_body, annotator.use_face, annotator.use_hand = True, True, False
+            annotator.use_body, annotator.use_face, annotator.use_hand = True, True, True
             self.annotators.append(annotator)
         self._current_worker = 0
@@ -239,8 +253,8 @@ class OptimizedPoseBodyFaceVideoAnnotator:
             # Process
             ret_data, _ = annotator.process(resized_image, frame.shape[:2])
-            if 'detected_map_bodyface' in ret_data:
-                return frame_idx, ret_data['detected_map_bodyface']
+            if 'detected_map_handbodyface' in ret_data:
+                return frame_idx, ret_data['detected_map_handbodyface']
             else:
                 # Create empty frame if no detection
                 h, w = frame.shape[:2]
@@ -267,8 +281,8 @@ class OptimizedPoseBodyFaceVideoAnnotator:
                 resized_image = resize_image(input_image, annotator.resize_size)
                 ret_data, _ = annotator.process(resized_image, frame.shape[:2])
-                if 'detected_map_bodyface' in ret_data:
-                    ret_frames.append(ret_data['detected_map_bodyface'])
+                if 'detected_map_handbodyface' in ret_data:
+                    ret_frames.append(ret_data['detected_map_handbodyface'])
                 else:
                     h, w = frame.shape[:2]
                     ret_frames.append(np.zeros((h, w, 3), dtype=np.uint8))
@@ -293,12 +307,109 @@ class OptimizedPoseBodyFaceVideoAnnotator:
         return results
-# Alias for backward compatibility
-class PoseBodyFaceVideoAnnotator(OptimizedPoseBodyFaceVideoAnnotator):
-    """Backward compatible class name"""
-    def __init__(self, cfg, num_workers=2, chunk_size=8):
-        # Use optimized version with conservative settings
-        super().__init__(cfg, num_workers=num_workers, chunk_size=chunk_size)
+class OptimizedPoseBodyFaceHandVideoAnnotator:
+    """Optimized video annotator that includes hands, body, and face"""
+    def __init__(self, cfg, num_workers=2, chunk_size=8):
+        self.cfg = cfg
+        self.num_workers = num_workers
+        self.chunk_size = chunk_size
+        self.use_body, self.use_face, self.use_hand = True, True, True  # Enable hands
+        # Initialize one annotator per worker to avoid ONNX session conflicts
+        self.annotators = []
+        for _ in range(num_workers):
+            annotator = OptimizedPoseAnnotator(cfg)
+            annotator.use_body, annotator.use_face, annotator.use_hand = True, True, True
+            self.annotators.append(annotator)
+        self._current_worker = 0
+        self._worker_lock = threading.Lock()
+    def _get_annotator(self):
+        """Get next available annotator in round-robin fashion"""
+        with self._worker_lock:
+            annotator = self.annotators[self._current_worker]
+            self._current_worker = (self._current_worker + 1) % len(self.annotators)
+            return annotator
+    def _process_single_frame(self, frame_data):
+        """Process a single frame with error handling"""
+        frame, frame_idx = frame_data
+        try:
+            annotator = self._get_annotator()
+            # Convert frame
+            frame = convert_to_numpy(frame)
+            input_image = HWC3(frame[..., ::-1])
+            resized_image = resize_image(input_image, annotator.resize_size)
+            # Process
+            ret_data, _ = annotator.process(resized_image, frame.shape[:2])
+            if 'detected_map_handbodyface' in ret_data:
+                return frame_idx, ret_data['detected_map_handbodyface']
+            else:
+                # Create empty frame if no detection
+                h, w = frame.shape[:2]
+                return frame_idx, np.zeros((h, w, 3), dtype=np.uint8)
+        except Exception as e:
+            print(f"Error processing frame {frame_idx}: {e}")
+            # Return empty frame on error
+            h, w = frame.shape[:2] if hasattr(frame, 'shape') else (480, 640)
+            return frame_idx, np.zeros((h, w, 3), dtype=np.uint8)
+    def forward(self, frames):
+        """Process video frames with optimizations"""
+        if len(frames) == 0:
+            return []
+        # For small number of frames, use serial processing to avoid threading overhead
+        if len(frames) <= 4:
+            annotator = self.annotators[0]
+            ret_frames = []
+            for frame in frames:
+                frame = convert_to_numpy(frame)
+                input_image = HWC3(frame[..., ::-1])
+                resized_image = resize_image(input_image, annotator.resize_size)
+                ret_data, _ = annotator.process(resized_image, frame.shape[:2])
+                if 'detected_map_handbodyface' in ret_data:
+                    ret_frames.append(ret_data['detected_map_handbodyface'])
+                else:
+                    h, w = frame.shape[:2]
+                    ret_frames.append(np.zeros((h, w, 3), dtype=np.uint8))
+            return ret_frames
+        # For larger videos, use parallel processing
+        frame_data = [(frame, idx) for idx, frame in enumerate(frames)]
+        results = [None] * len(frames)
+        # Process in chunks to manage memory
+        for chunk_start in range(0, len(frame_data), self.chunk_size * self.num_workers):
+            chunk_end = min(chunk_start + self.chunk_size * self.num_workers, len(frame_data))
+            chunk_data = frame_data[chunk_start:chunk_end]
+            with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+                chunk_results = list(executor.map(self._process_single_frame, chunk_data))
+            # Store results in correct order
+            for frame_idx, result in chunk_results:
+                results[frame_idx] = result
+        return results
+# Choose which version you want to use:
+# Option 1: Body + Face only (original behavior)
+class PoseBodyFaceVideoAnnotator(OptimizedPoseBodyFaceVideoAnnotator):
+    """Backward compatible class name - Body and Face only"""
+# Option 2: Body + Face + Hands (if you want hands)
+class PoseBodyFaceHandVideoAnnotator(OptimizedPoseBodyFaceHandVideoAnnotator):
+    """Video annotator with hands, body, and face"""
+    def __init__(self, cfg):
+        super().__init__(cfg, num_workers=2, chunk_size=4)
 # Keep the existing utility functions
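
For orientation, a minimal usage sketch of the two variants defined above; it is not part of the commit, and the `cfg` contents and dummy frames are assumptions (the real `cfg` must provide whatever `OptimizedPoseAnnotator` expects, such as the ONNX detection and pose model settings):

    import numpy as np

    cfg = {}  # assumed: filled with the model settings OptimizedPoseAnnotator expects

    body_face = PoseBodyFaceVideoAnnotator(cfg)           # body + face only, original behavior
    body_face_hand = PoseBodyFaceHandVideoAnnotator(cfg)  # adds hands, num_workers=2, chunk_size=4

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(8)]  # stand-in frames
    pose_maps = body_face_hand.forward(frames)  # list of HxWx3 uint8 pose maps, one per frame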

View File

@@ -148,7 +148,8 @@ class OptimizedPyannote31SpeakerSeparator:
         both_speaking_mask = np.zeros(audio_length, dtype=bool)
         # ── 1) try the proper overlap model ────────────────────────────────
-        overlap_pipeline = self._get_overlap_pipeline()
+        # overlap_pipeline = self._get_overlap_pipeline() # doesnt work anyway
+        overlap_pipeline = None
         # try the path stored by separate_audio otherwise whatever the
         # diarization object carries (may be None)

wgp.py (22 changes)
View File

@@ -3295,7 +3295,7 @@ def get_preprocessor(process_type, inpaint_color):
 def process_images_multithread(image_processor, items, process_type, wrap_in_list = True, max_workers: int = os.cpu_count()/ 2) :
     if not items:
         return []
+    max_workers = 11
     import concurrent.futures
     start_time = time.time()
     # print(f"Preprocessus:{process_type} started")
@@ -3910,6 +3910,7 @@ def generate_video(
     prompts = prompt.split("\n")
     prompts = [part for part in prompts if len(prompt)>0]
+    parsed_keep_frames_video_source= max_source_video_frames if len(keep_frames_video_source) ==0 else int(keep_frames_video_source)
     loras = state["loras"]
@@ -4026,13 +4027,13 @@ def generate_video(
         frames_to_inject = [None] * (max(frames_positions_list) + 1)
         for i, pos in enumerate(frames_positions_list):
             frames_to_inject[pos] = image_refs[i]
-        if video_guide == None and video_source == None and not "L" in image_prompt_type:
+        if video_guide == None and video_source == None and not "L" in image_prompt_type and (nb_frames_positions > 0 or "K" in video_prompt_type) :
            from wan.utils.utils import resize_lanczos, calculate_new_dimensions, get_outpainting_full_area_dimensions
            w, h = image_refs[0].size
            if outpainting_dims != None:
                h, w = get_outpainting_full_area_dimensions(h,w, outpainting_dims)
            default_image_size = calculate_new_dimensions(height, width, h, w, fit_canvas)
            fit_canvas = None
         if len(image_refs) > nb_frames_positions:
            if hunyuan_avatar: remove_background_images_ref = 0
            any_background_ref = "K" in video_prompt_type
@@ -4267,8 +4268,7 @@ def generate_video(
                 from wan.utils.utils import get_video_frame
                 refresh_preview["video_source"] = get_video_frame(video_source, 0)
             if video_source != None and len(video_source) > 0 and window_no == 1:
-                keep_frames_video_source= max_source_video_frames if len(keep_frames_video_source) ==0 else int(keep_frames_video_source)
-                prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, target_fps = fps, block_size = 32 if ltxv else 16)
+                prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, target_fps = fps, block_size = 32 if ltxv else 16)
                 prefix_video = prefix_video.permute(3, 0, 1, 2)
                 prefix_video = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w
                 pre_video_guide = prefix_video[:, -reuse_frames:]
@@ -4629,7 +4629,7 @@ def generate_video(
             send_cmd("output")
-            seed = set_seed(seed)
+            seed = set_seed(-1)
         clear_status(state)
         offload.unload_loras_from_model(trans)
         if len(control_audio_tracks) > 0: