fx

commit 8d2164aaf1
parent 6a28bb8d4d
@@ -11,7 +11,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 - Very Fast on the latest GPUs
 - Easy to use Full Web based interface
 - Auto download of the required model adapted to your specific architecture
-- Tools integrated to facilitate Video Generation : Mask Editor, Prompt Enhancer, Temporal and Spatial Generation, MMAudio, Vew
+- Tools integrated to facilitate Video Generation : Mask Editor, Prompt Enhancer, Temporal and Spatial Generation, MMAudio, Video Browser, Pose / Depth / Flow extractor
 - Loras Support to customize each model
 - Queuing system : make your shopping list of videos to generate and come back later

@@ -27,7 +27,7 @@ Of course you will get as well *Multitalk* vanilla and also *Multitalk 720p* as

 And since I am mister nice guy I have enclosed as an exclusivity an *Audio Separator* that will save you time to isolate each voice when using Multitalk with two people.

-As I feel like a resting a bit I haven't produced a nice sample Video to illustrate all these new capabilities. But here is the thing, I ams sure you will publish in the *Share Your Best Video* channel your *Master Pieces*. The best one will be added to the *Announcements Channel* and will bring eternal fame to its author.
+As I feel like resting a bit I haven't yet produced a nice sample Video to illustrate all these new capabilities. But here is the thing: I am sure you will publish your *Masterpieces* in the *Share Your Best Video* channel. The best ones will be added to the *Announcements Channel* and will bring eternal fame to their authors.

 But wait, there is more:
 - Sliding Windows support has been added anywhere with Wan models, so imagine with text2video recently upgraded in 6.5 into a video2video, you can now upsample very long videos regardless of your VRAM. The good old image2video model can now reuse the last image to produce new videos (as requested by many of you)
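Note on the sliding-windows feature above: long videos are produced window by window, and each new window reuses the last few frames of the previous one as context, so peak VRAM depends on the window size rather than the total video length. A minimal, self-contained sketch of the window arithmetic (the window and overlap sizes here are illustrative, not WanGP's defaults):

def sliding_windows(total_frames: int, window_size: int = 81, reuse_frames: int = 8):
    """Yield (start, end) frame ranges; each window overlaps the previous one by reuse_frames."""
    start = 0
    while start < total_frames:
        end = min(start + window_size, total_frames)
        yield start, end
        if end == total_frames:
            break
        start = end - reuse_frames  # re-feed the tail of the last window as context

print(list(sliding_windows(200)))
# [(0, 81), (73, 154), (146, 200)]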
@@ -117,6 +117,20 @@ class PoseAnnotator:
         H, W, C = ori_img.shape
         with torch.no_grad():
             candidate, subset, det_result = self.pose_estimation(ori_img)

+            if len(candidate) == 0:
+                # No detections - return empty results
+                empty_ret_data = {}
+                if self.use_body:
+                    empty_ret_data["detected_map_body"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                if self.use_face:
+                    empty_ret_data["detected_map_face"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                if self.use_body and self.use_face:
+                    empty_ret_data["detected_map_bodyface"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                if self.use_hand and self.use_body and self.use_face:
+                    empty_ret_data["detected_map_handbodyface"] = np.zeros((ori_h, ori_w, 3), dtype=np.uint8)
+                return empty_ret_data, np.array([])
+
             nums, keys, locs = candidate.shape
             candidate[..., 0] /= float(W)
             candidate[..., 1] /= float(H)
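The added guard returns an all-black map (at the original frame size) for every enabled output plus an empty keypoint array, so callers never index into an empty candidate tensor. For reference, an equivalent self-contained way to build those empty maps (an illustrative rewrite, not code from the commit):

import numpy as np

def empty_pose_maps(ori_h, ori_w, use_body=True, use_face=True, use_hand=False):
    """Build the all-black maps returned when no person is detected."""
    enabled = {
        "detected_map_body": use_body,
        "detected_map_face": use_face,
        "detected_map_bodyface": use_body and use_face,
        "detected_map_handbodyface": use_hand and use_body and use_face,
    }
    return {key: np.zeros((ori_h, ori_w, 3), dtype=np.uint8) for key, on in enabled.items() if on}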
@@ -202,17 +216,17 @@ class PoseBodyFaceAnnotator(PoseAnnotator):

 class OptimizedPoseBodyFaceVideoAnnotator:
     """Optimized video annotator with multiple optimization strategies"""
-    def __init__(self, cfg, num_workers=5, chunk_size=8):
+    def __init__(self, cfg, num_workers=2, chunk_size=8):
         self.cfg = cfg
         self.num_workers = num_workers
         self.chunk_size = chunk_size
-        self.use_body, self.use_face, self.use_hand = True, True, False
+        self.use_body, self.use_face, self.use_hand = True, True, True

         # Initialize one annotator per worker to avoid ONNX session conflicts
         self.annotators = []
         for _ in range(num_workers):
             annotator = OptimizedPoseAnnotator(cfg)
-            annotator.use_body, annotator.use_face, annotator.use_hand = True, True, False
+            annotator.use_body, annotator.use_face, annotator.use_hand = True, True, True
             self.annotators.append(annotator)

         self._current_worker = 0
@@ -239,8 +253,8 @@ class OptimizedPoseBodyFaceVideoAnnotator:
             # Process
             ret_data, _ = annotator.process(resized_image, frame.shape[:2])

-            if 'detected_map_bodyface' in ret_data:
-                return frame_idx, ret_data['detected_map_bodyface']
+            if 'detected_map_handbodyface' in ret_data:
+                return frame_idx, ret_data['detected_map_handbodyface']
             else:
                 # Create empty frame if no detection
                 h, w = frame.shape[:2]
@@ -267,8 +281,8 @@ class OptimizedPoseBodyFaceVideoAnnotator:
                 resized_image = resize_image(input_image, annotator.resize_size)
                 ret_data, _ = annotator.process(resized_image, frame.shape[:2])

-                if 'detected_map_bodyface' in ret_data:
-                    ret_frames.append(ret_data['detected_map_bodyface'])
+                if 'detected_map_handbodyface' in ret_data:
+                    ret_frames.append(ret_data['detected_map_handbodyface'])
                 else:
                     h, w = frame.shape[:2]
                     ret_frames.append(np.zeros((h, w, 3), dtype=np.uint8))
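Both hunks above follow from enabling use_hand: the combined map is now stored under detected_map_handbodyface rather than detected_map_bodyface. If calling code has to cope with either configuration, a defensive lookup could cover both keys (a suggestion, not part of this commit):

import numpy as np

def pick_pose_map(ret_data, h, w):
    # Prefer the hands+body+face map, fall back to body+face, else return a blank frame.
    for key in ("detected_map_handbodyface", "detected_map_bodyface"):
        if key in ret_data:
            return ret_data[key]
    return np.zeros((h, w, 3), dtype=np.uint8)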
@@ -293,12 +307,109 @@ class OptimizedPoseBodyFaceVideoAnnotator:
         return results


-# Alias for backward compatibility
-class PoseBodyFaceVideoAnnotator(OptimizedPoseBodyFaceVideoAnnotator):
-    """Backward compatible class name"""
-    def __init__(self, cfg, num_workers=2, chunk_size=8):
-        # Use optimized version with conservative settings
-        super().__init__(cfg, num_workers=num_workers, chunk_size=chunk_size)
+class OptimizedPoseBodyFaceHandVideoAnnotator:
+    """Optimized video annotator that includes hands, body, and face"""
+    def __init__(self, cfg, num_workers=2, chunk_size=8):
+        self.cfg = cfg
+        self.num_workers = num_workers
+        self.chunk_size = chunk_size
+        self.use_body, self.use_face, self.use_hand = True, True, True  # Enable hands
+
+        # Initialize one annotator per worker to avoid ONNX session conflicts
+        self.annotators = []
+        for _ in range(num_workers):
+            annotator = OptimizedPoseAnnotator(cfg)
+            annotator.use_body, annotator.use_face, annotator.use_hand = True, True, True
+            self.annotators.append(annotator)
+
+        self._current_worker = 0
+        self._worker_lock = threading.Lock()
+
+    def _get_annotator(self):
+        """Get next available annotator in round-robin fashion"""
+        with self._worker_lock:
+            annotator = self.annotators[self._current_worker]
+            self._current_worker = (self._current_worker + 1) % len(self.annotators)
+            return annotator
+
+    def _process_single_frame(self, frame_data):
+        """Process a single frame with error handling"""
+        frame, frame_idx = frame_data
+        try:
+            annotator = self._get_annotator()
+
+            # Convert frame
+            frame = convert_to_numpy(frame)
+            input_image = HWC3(frame[..., ::-1])
+            resized_image = resize_image(input_image, annotator.resize_size)
+
+            # Process
+            ret_data, _ = annotator.process(resized_image, frame.shape[:2])
+
+            if 'detected_map_handbodyface' in ret_data:
+                return frame_idx, ret_data['detected_map_handbodyface']
+            else:
+                # Create empty frame if no detection
+                h, w = frame.shape[:2]
+                return frame_idx, np.zeros((h, w, 3), dtype=np.uint8)
+
+        except Exception as e:
+            print(f"Error processing frame {frame_idx}: {e}")
+            # Return empty frame on error
+            h, w = frame.shape[:2] if hasattr(frame, 'shape') else (480, 640)
+            return frame_idx, np.zeros((h, w, 3), dtype=np.uint8)
+
+    def forward(self, frames):
+        """Process video frames with optimizations"""
+        if len(frames) == 0:
+            return []
+
+        # For small number of frames, use serial processing to avoid threading overhead
+        if len(frames) <= 4:
+            annotator = self.annotators[0]
+            ret_frames = []
+            for frame in frames:
+                frame = convert_to_numpy(frame)
+                input_image = HWC3(frame[..., ::-1])
+                resized_image = resize_image(input_image, annotator.resize_size)
+                ret_data, _ = annotator.process(resized_image, frame.shape[:2])
+
+                if 'detected_map_handbodyface' in ret_data:
+                    ret_frames.append(ret_data['detected_map_handbodyface'])
+                else:
+                    h, w = frame.shape[:2]
+                    ret_frames.append(np.zeros((h, w, 3), dtype=np.uint8))
+            return ret_frames
+
+        # For larger videos, use parallel processing
+        frame_data = [(frame, idx) for idx, frame in enumerate(frames)]
+        results = [None] * len(frames)
+
+        # Process in chunks to manage memory
+        for chunk_start in range(0, len(frame_data), self.chunk_size * self.num_workers):
+            chunk_end = min(chunk_start + self.chunk_size * self.num_workers, len(frame_data))
+            chunk_data = frame_data[chunk_start:chunk_end]
+
+            with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+                chunk_results = list(executor.map(self._process_single_frame, chunk_data))
+
+            # Store results in correct order
+            for frame_idx, result in chunk_results:
+                results[frame_idx] = result
+
+        return results
+
+
+# Choose which version you want to use:
+
+# Option 1: Body + Face only (original behavior)
+class PoseBodyFaceVideoAnnotator(OptimizedPoseBodyFaceVideoAnnotator):
+    """Backward compatible class name - Body and Face only"""
+
+# Option 2: Body + Face + Hands (if you want hands)
+class PoseBodyFaceHandVideoAnnotator(OptimizedPoseBodyFaceHandVideoAnnotator):
+    """Video annotator with hands, body, and face"""
+    def __init__(self, cfg):
+        super().__init__(cfg, num_workers=2, chunk_size=4)
+
+
 # Keep the existing utility functions
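The new OptimizedPoseBodyFaceHandVideoAnnotator combines two patterns: a lock-guarded round-robin pool of per-worker annotators (so no ONNX session is shared across threads) and chunked submission that keeps at most chunk_size * num_workers frames in flight. A stripped-down, self-contained sketch of that combination with dummy workers (illustrative only, not the repo's code):

import threading
from concurrent.futures import ThreadPoolExecutor

class RoundRobinPool:
    """Hand each caller the next worker in turn, guarded by a lock."""
    def __init__(self, workers):
        self._workers = workers
        self._idx = 0
        self._lock = threading.Lock()

    def get(self):
        with self._lock:
            worker = self._workers[self._idx]
            self._idx = (self._idx + 1) % len(self._workers)
            return worker

def run_chunked(items, pool, num_workers=2, chunk_size=4):
    # Submit items in chunks of chunk_size * num_workers so memory use stays bounded,
    # then reassemble results in the original order.
    results = [None] * len(items)
    data = list(enumerate(items))
    step = chunk_size * num_workers
    for start in range(0, len(data), step):
        chunk = data[start:start + step]
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            for idx, out in executor.map(lambda pair: (pair[0], pool.get()(pair[1])), chunk):
                results[idx] = out
    return results

# Two dummy "annotators": each tags its input with the worker id that handled it.
pool = RoundRobinPool([lambda x, i=i: (i, x) for i in range(2)])
print(run_chunked(list(range(10)), pool))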
@@ -148,7 +148,8 @@ class OptimizedPyannote31SpeakerSeparator:
         both_speaking_mask = np.zeros(audio_length, dtype=bool)

         # ── 1) try the proper overlap model ────────────────────────────────
-        overlap_pipeline = self._get_overlap_pipeline()
+        # overlap_pipeline = self._get_overlap_pipeline()  # doesn't work anyway
+        overlap_pipeline = None

         # try the path stored by separate_audio – otherwise whatever the
         # diarization object carries (may be None)
wgp.py
@@ -3295,7 +3295,7 @@ def get_preprocessor(process_type, inpaint_color):
 def process_images_multithread(image_processor, items, process_type, wrap_in_list = True, max_workers: int = os.cpu_count()/ 2) :
     if not items:
         return []
+    max_workers = 11
     import concurrent.futures
     start_time = time.time()
     # print(f"Preprocessus:{process_type} started")
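The added line pins the pool size to 11 threads, overriding the os.cpu_count()/2 default from the signature. For context, a minimal self-contained sketch of the fan-out pattern this helper implements (the name and the wrap_in_list behaviour are assumptions, not the repo's exact code):

import time
from concurrent.futures import ThreadPoolExecutor

def process_images_multithread_sketch(image_processor, items, wrap_in_list=True, max_workers=11):
    # Apply image_processor to every item on a thread pool, preserving input order.
    if not items:
        return []
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(image_processor, items))
    print(f"processed {len(items)} items in {time.time() - start_time:.2f}s")
    return [[r] for r in results] if wrap_in_list else results

# Example: square some numbers "as if" they were images.
print(process_images_multithread_sketch(lambda x: x * x, [1, 2, 3]))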
@@ -3910,6 +3910,7 @@ def generate_video(

     prompts = prompt.split("\n")
     prompts = [part for part in prompts if len(prompt)>0]
+    parsed_keep_frames_video_source= max_source_video_frames if len(keep_frames_video_source) ==0 else int(keep_frames_video_source)


     loras = state["loras"]
@@ -4026,13 +4027,13 @@ def generate_video(
         frames_to_inject = [None] * (max(frames_positions_list) + 1)
         for i, pos in enumerate(frames_positions_list):
             frames_to_inject[pos] = image_refs[i]
-        if video_guide == None and video_source == None and not "L" in image_prompt_type:
+        if video_guide == None and video_source == None and not "L" in image_prompt_type and (nb_frames_positions > 0 or "K" in video_prompt_type) :
             from wan.utils.utils import resize_lanczos, calculate_new_dimensions, get_outpainting_full_area_dimensions
             w, h = image_refs[0].size
             if outpainting_dims != None:
                 h, w = get_outpainting_full_area_dimensions(h,w, outpainting_dims)
             default_image_size = calculate_new_dimensions(height, width, h, w, fit_canvas)
             fit_canvas = None
         if len(image_refs) > nb_frames_positions:
             if hunyuan_avatar: remove_background_images_ref = 0
             any_background_ref = "K" in video_prompt_type
@@ -4267,8 +4268,7 @@ def generate_video(
                 from wan.utils.utils import get_video_frame
                 refresh_preview["video_source"] = get_video_frame(video_source, 0)
             if video_source != None and len(video_source) > 0 and window_no == 1:
-                keep_frames_video_source= max_source_video_frames if len(keep_frames_video_source) ==0 else int(keep_frames_video_source)
-                prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, target_fps = fps, block_size = 32 if ltxv else 16)
+                prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, target_fps = fps, block_size = 32 if ltxv else 16)
                 prefix_video = prefix_video.permute(3, 0, 1, 2)
                 prefix_video = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w
                 pre_video_guide = prefix_video[:, -reuse_frames:]
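This hunk pairs with the one at line 3910: keep_frames_video_source is now parsed once into parsed_keep_frames_video_source before the sliding-window loop instead of being re-parsed (and overwritten) here on the first window. The parsing rule itself, as a tiny standalone sketch (the maximum-frames value is a placeholder, not the repo's constant):

MAX_SOURCE_VIDEO_FRAMES = 1000  # placeholder for the sketch; the real limit comes from wgp.py

def parse_keep_frames(keep_frames_video_source: str) -> int:
    # Empty field -> keep up to the maximum allowed source frames; otherwise use the number given.
    return MAX_SOURCE_VIDEO_FRAMES if len(keep_frames_video_source) == 0 else int(keep_frames_video_source)

print(parse_keep_frames(""))    # 1000
print(parse_keep_frames("81"))  # 81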
@@ -4629,7 +4629,7 @@ def generate_video(

         send_cmd("output")

-    seed = set_seed(seed)
+    seed = set_seed(-1)
     clear_status(state)
     offload.unload_loras_from_model(trans)
     if len(control_audio_tracks) > 0: