Mirror of https://github.com/Wan-Video/Wan2.1.git (synced 2025-12-15 11:43:21 +00:00)

Support for Hunyuan Video Avatar

This commit is contained in:
parent 7670af9610
commit 1976868f6a
@@ -21,7 +21,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models

## 🔥 Latest News!!

* May 26 2025: 👋 WanGP v5.31 : Added Phantom 14B, a model that you can use to transfer objects / people into a video. My preference still goes to Vace, which remains the king of controlnets.
* May 28 2025: 👋 WanGP v5.31 : Added Phantom 14B, a model that you can use to transfer objects / people into a video. My preference still goes to Vace, which remains the king of controlnets.
* May 26 2025: 👋 WanGP v5.3 : Happy with a video generation and want to do more generations using the same settings, but you can't remember what you did, or you find it too hard to copy / paste each setting one by one from the file metadata? Rejoice! There are now multiple ways to turn this tedious process into a one-click task:
    - Select a video recently generated in the Video Gallery and click *Use Selected Video Settings*
    - Click *Drop File Here* and select a video you saved somewhere; if the settings metadata were saved with the video, you will be able to extract them automatically
170 hyvideo/data_kits/audio_dataset.py Normal file
@@ -0,0 +1,170 @@
import os
import cv2
import math
import json
import torch
import random
import librosa
import traceback
import torchvision
import numpy as np
import pandas as pd
from PIL import Image
from einops import rearrange
from torch.utils.data import Dataset
from decord import VideoReader, cpu
from transformers import CLIPImageProcessor
import torchvision.transforms as transforms
from torchvision.transforms import ToPILImage


def get_audio_feature(feature_extractor, audio_path):
    # Load 16 kHz audio and run the feature extractor in 30 s windows (750 * 640 samples);
    # also return the audio length in 640-sample (25 fps) frames.
    audio_input, sampling_rate = librosa.load(audio_path, sr=16000)
    assert sampling_rate == 16000

    audio_features = []
    window = 750 * 640
    for i in range(0, len(audio_input), window):
        audio_feature = feature_extractor(audio_input[i:i + window],
                                          sampling_rate=sampling_rate,
                                          return_tensors="pt",
                                          ).input_features
        audio_features.append(audio_feature)

    audio_features = torch.cat(audio_features, dim=-1)
    return audio_features, len(audio_input) // 640

class VideoAudioTextLoaderVal(Dataset):
    def __init__(
        self,
        image_size: int,
        meta_file: str,
        **kwargs,
    ):
        super().__init__()
        self.meta_file = meta_file
        self.image_size = image_size
        self.text_encoder = kwargs.get("text_encoder", None)        # llava_text_encoder
        self.text_encoder_2 = kwargs.get("text_encoder_2", None)    # clipL_text_encoder
        self.feature_extractor = kwargs.get("feature_extractor", None)
        self.meta_files = []

        csv_data = pd.read_csv(meta_file)
        for idx in range(len(csv_data)):
            self.meta_files.append(
                {
                    "videoid": str(csv_data["videoid"][idx]),
                    "image_path": str(csv_data["image"][idx]),
                    "audio_path": str(csv_data["audio"][idx]),
                    "prompt": str(csv_data["prompt"][idx]),
                    "fps": float(csv_data["fps"][idx])
                }
            )

        self.llava_transform = transforms.Compose(
            [
                transforms.Resize((336, 336), interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.ToTensor(),
                transforms.Normalize((0.48145466, 0.4578275, 0.4082107), (0.26862954, 0.26130258, 0.27577711)),
            ]
        )
        self.clip_image_processor = CLIPImageProcessor()

        self.device = torch.device("cuda")
        self.weight_dtype = torch.float16

    def __len__(self):
        return len(self.meta_files)

    @staticmethod
    def get_text_tokens(text_encoder, description, dtype_encode="video"):
        text_inputs = text_encoder.text2tokens(description, data_type=dtype_encode)
        text_ids = text_inputs["input_ids"].squeeze(0)
        text_mask = text_inputs["attention_mask"].squeeze(0)
        return text_ids, text_mask

    def get_batch_data(self, idx):
        meta_file = self.meta_files[idx]
        videoid = meta_file["videoid"]
        image_path = meta_file["image_path"]
        audio_path = meta_file["audio_path"]
        prompt = "Authentic, Realistic, Natural, High-quality, Lens-Fixed, " + meta_file["prompt"]
        fps = meta_file["fps"]

        img_size = self.image_size
        ref_image = Image.open(image_path).convert('RGB')

        # Resize reference image
        w, h = ref_image.size
        scale = img_size / min(w, h)
        new_w = round(w * scale / 64) * 64
        new_h = round(h * scale / 64) * 64

        if img_size == 704:
            img_size_long = 1216
            if new_w * new_h > img_size * img_size_long:
                import math
                scale = math.sqrt(img_size * img_size_long / w / h)
                new_w = round(w * scale / 64) * 64
                new_h = round(h * scale / 64) * 64

        ref_image = ref_image.resize((new_w, new_h), Image.LANCZOS)

        ref_image = np.array(ref_image)
        ref_image = torch.from_numpy(ref_image)

        audio_input, audio_len = get_audio_feature(self.feature_extractor, audio_path)
        audio_prompts = audio_input[0]

        motion_bucket_id_heads = np.array([25] * 4)
        motion_bucket_id_exps = np.array([30] * 4)
        motion_bucket_id_heads = torch.from_numpy(motion_bucket_id_heads)
        motion_bucket_id_exps = torch.from_numpy(motion_bucket_id_exps)
        fps = torch.from_numpy(np.array(fps))

        to_pil = ToPILImage()
        pixel_value_ref = rearrange(ref_image.clone().unsqueeze(0), "b h w c -> b c h w")  # (b c h w)

        pixel_value_ref_llava = [self.llava_transform(to_pil(image)) for image in pixel_value_ref]
        pixel_value_ref_llava = torch.stack(pixel_value_ref_llava, dim=0)
        pixel_value_ref_clip = self.clip_image_processor(
            images=Image.fromarray((pixel_value_ref[0].permute(1, 2, 0)).data.cpu().numpy().astype(np.uint8)),
            return_tensors="pt"
        ).pixel_values[0]
        pixel_value_ref_clip = pixel_value_ref_clip.unsqueeze(0)

        # Encode text prompts
        text_ids, text_mask = self.get_text_tokens(self.text_encoder, prompt)
        text_ids_2, text_mask_2 = self.get_text_tokens(self.text_encoder_2, prompt)

        # Output batch
        batch = {
            "text_prompt": prompt,
            "videoid": videoid,
            "pixel_value_ref": pixel_value_ref.to(dtype=torch.float16),              # reference image for VAE features, (1, 3, h, w), values in (0, 255)
            "pixel_value_ref_llava": pixel_value_ref_llava.to(dtype=torch.float16),  # reference image for llava features, (1, 3, 336, 336), values in the CLIP range
            "pixel_value_ref_clip": pixel_value_ref_clip.to(dtype=torch.float16),    # reference image for clip_image_encoder features, (1, 3, 244, 244), values in the CLIP range
            "audio_prompts": audio_prompts.to(dtype=torch.float16),
            "motion_bucket_id_heads": motion_bucket_id_heads.to(dtype=text_ids.dtype),
            "motion_bucket_id_exps": motion_bucket_id_exps.to(dtype=text_ids.dtype),
            "fps": fps.to(dtype=torch.float16),
            "text_ids": text_ids.clone(),        # for llava_text_encoder
            "text_mask": text_mask.clone(),      # for llava_text_encoder
            "text_ids_2": text_ids_2.clone(),    # for clip_text_encoder
            "text_mask_2": text_mask_2.clone(),  # for clip_text_encoder
            "audio_len": audio_len,
            "image_path": image_path,
            "audio_path": audio_path,
        }
        return batch

    def __getitem__(self, idx):
        return self.get_batch_data(idx)
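A minimal, hedged usage sketch (not part of the commit): it assumes a metadata CSV with the columns read above (videoid, image, audio, prompt, fps) and reuses the whisper-tiny checkpoint path referenced elsewhere in this commit; the two text encoders are left as None here, so only dataset construction and length are exercised.

# Hypothetical usage sketch; "assets/val_meta.csv" is an assumed path.
from transformers import AutoFeatureExtractor
from hyvideo.data_kits.audio_dataset import VideoAudioTextLoaderVal

feature_extractor = AutoFeatureExtractor.from_pretrained("ckpts/whisper-tiny/")
dataset = VideoAudioTextLoaderVal(
    image_size=704,
    meta_file="assets/val_meta.csv",      # assumed CSV with videoid/image/audio/prompt/fps columns
    feature_extractor=feature_extractor,
    text_encoder=None,                    # llava text encoder omitted in this sketch
    text_encoder_2=None,                  # clipL text encoder omitted in this sketch
)
print(len(dataset))                       # number of validation samples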
72 hyvideo/data_kits/audio_preprocessor.py Normal file
@@ -0,0 +1,72 @@
import os
import cv2
import json
import time
import decord
import einops
import librosa
import torch
import random
import argparse
import traceback
import numpy as np
from tqdm import tqdm
from PIL import Image
from einops import rearrange


def get_facemask(ref_image, align_instance, area=1.25):
    # ref_image: (b f c h w)
    bsz, f, c, h, w = ref_image.shape
    images = rearrange(ref_image, "b f c h w -> (b f) h w c").data.cpu().numpy().astype(np.uint8)
    face_masks = []
    for image in images:
        image_pil = Image.fromarray(image).convert("RGB")
        _, _, bboxes_list = align_instance(np.array(image_pil)[:, :, [2, 1, 0]], maxface=True)
        try:
            bboxSrc = bboxes_list[0]
        except IndexError:
            # no face detected: fall back to the full frame
            bboxSrc = [0, 0, w, h]
        x1, y1, ww, hh = bboxSrc
        x2, y2 = x1 + ww, y1 + hh
        ww, hh = (x2 - x1) * area, (y2 - y1) * area
        center = [(x2 + x1) // 2, (y2 + y1) // 2]
        x1 = max(center[0] - ww // 2, 0)
        y1 = max(center[1] - hh // 2, 0)
        x2 = min(center[0] + ww // 2, w)
        y2 = min(center[1] + hh // 2, h)

        face_mask = np.zeros_like(np.array(image_pil))
        face_mask[int(y1):int(y2), int(x1):int(x2)] = 1.0
        face_masks.append(torch.from_numpy(face_mask[..., :1]))
    face_masks = torch.stack(face_masks, dim=0)  # (b*f, h, w, c)
    face_masks = rearrange(face_masks, "(b f) h w c -> b c f h w", b=bsz, f=f)
    face_masks = face_masks.to(device=ref_image.device, dtype=ref_image.dtype)
    return face_masks


def encode_audio(wav2vec, audio_feats, fps, num_frames=129):
    if fps == 25:
        start_ts = [0]
        step_ts = [1]
    elif fps == 12.5:
        start_ts = [0]
        step_ts = [2]
    num_frames = min(num_frames, 400)
    audio_feats = wav2vec.encoder(audio_feats.unsqueeze(0)[:, :, :3000], output_hidden_states=True).hidden_states
    audio_feats = torch.stack(audio_feats, dim=2)
    audio_feats = torch.cat([torch.zeros_like(audio_feats[:, :4]), audio_feats], 1)

    audio_prompts = []
    for bb in range(1):
        audio_feats_list = []
        for f in range(num_frames):
            cur_t = (start_ts[bb] + f * step_ts[bb]) * 2
            audio_clip = audio_feats[bb:bb + 1, cur_t: cur_t + 10]
            audio_feats_list.append(audio_clip)
        audio_feats_list = torch.stack(audio_feats_list, 1)
        audio_prompts.append(audio_feats_list)
    audio_prompts = torch.cat(audio_prompts)
    return audio_prompts
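A hedged shape sketch (not part of the commit): it assumes the whisper-tiny checkpoint used elsewhere in this commit, whose encoder exposes 5 hidden states of width 384, so each video frame gets a sliding window of 10 feature steps across those 5 layers.

# Assumed checkpoint and shapes; random features stand in for real log-mel input.
import torch
from transformers import WhisperModel
from hyvideo.data_kits.audio_preprocessor import encode_audio

wav2vec = WhisperModel.from_pretrained("ckpts/whisper-tiny/").eval()
audio_feats = torch.randn(80, 3000)            # one 30 s window of log-mel features
with torch.no_grad():
    audio_prompts = encode_audio(wav2vec, audio_feats, fps=25, num_frames=129)
print(audio_prompts.shape)                     # expected: (1, 129, 10, 5, 384)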
41 hyvideo/data_kits/data_tools.py Normal file
@@ -0,0 +1,41 @@
import os
import cv2
import torch
import numpy as np
import imageio
import torchvision
from einops import rearrange


def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8, quality=8):
    videos = rearrange(videos, "b c t h w -> t b c h w")
    outputs = []
    for x in videos:
        x = torchvision.utils.make_grid(x, nrow=n_rows)
        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
        if rescale:
            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
        x = torch.clamp(x, 0, 1)
        x = (x * 255).numpy().astype(np.uint8)
        outputs.append(x)

    os.makedirs(os.path.dirname(path), exist_ok=True)
    imageio.mimsave(path, outputs, fps=fps, quality=quality)

def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
    crop_h, crop_w = crop_img.shape[:2]
    target_w, target_h = size
    scale_h, scale_w = target_h / crop_h, target_w / crop_w
    if scale_w > scale_h:
        resize_h = int(target_h * resize_ratio)
        resize_w = int(crop_w / crop_h * resize_h)
    else:
        resize_w = int(target_w * resize_ratio)
        resize_h = int(crop_h / crop_w * resize_w)
    crop_img = cv2.resize(crop_img, (resize_w, resize_h))
    pad_left = (target_w - resize_w) // 2
    pad_top = (target_h - resize_h) // 2
    pad_right = target_w - resize_w - pad_left
    pad_bottom = target_h - resize_h - pad_top
    crop_img = cv2.copyMakeBorder(crop_img, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=color)
    return crop_img
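A short, hedged usage sketch (not part of the commit): pad_image letterboxes an image to a target (width, height) with constant-color borders while preserving aspect ratio.

# Hypothetical input frame; size is (target_w, target_h).
import numpy as np
from hyvideo.data_kits.data_tools import pad_image

frame = np.zeros((480, 640, 3), dtype=np.uint8)   # assumed 640x480 image
padded = pad_image(frame, (704, 704))             # white letterbox padding
print(padded.shape)                               # (704, 704, 3)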
1 hyvideo/data_kits/face_align/__init__.py Normal file
@@ -0,0 +1 @@
from .align import AlignImage
34 hyvideo/data_kits/face_align/align.py Normal file
@@ -0,0 +1,34 @@
import os
import sys
import torch
from .detface import DetFace

class AlignImage(object):
    def __init__(self, device='cuda', det_path=''):
        self.facedet = DetFace(pt_path=det_path, confThreshold=0.5, nmsThreshold=0.45, device=device)

    @torch.no_grad()
    def __call__(self, im, maxface=False):
        bboxes, kpss, scores = self.facedet.detect(im)
        face_num = bboxes.shape[0]

        five_pts_list = []
        scores_list = []
        bboxes_list = []
        for i in range(face_num):
            five_pts_list.append(kpss[i].reshape(5, 2))
            scores_list.append(scores[i])
            bboxes_list.append(bboxes[i])

        if maxface and face_num > 1:
            max_idx = 0
            max_area = (bboxes[0, 2]) * (bboxes[0, 3])
            for i in range(1, face_num):
                area = (bboxes[i, 2]) * (bboxes[i, 3])
                if area > max_area:
                    max_idx = i
                    max_area = area  # keep track of the current largest area
            five_pts_list = [five_pts_list[max_idx]]
            scores_list = [scores_list[max_idx]]
            bboxes_list = [bboxes_list[max_idx]]

        return five_pts_list, scores_list, bboxes_list
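A hedged usage sketch (not part of the commit), using the detector checkpoint path referenced elsewhere in this commit; note that get_facemask above converts RGB to BGR with [:, :, [2, 1, 0]] before calling the aligner, so a BGR image is assumed here.

# "reference.png" is a hypothetical image path.
import cv2
from hyvideo.data_kits.face_align import AlignImage

align_instance = AlignImage(device="cuda", det_path="ckpts/det_align/detface.pt")
img_bgr = cv2.imread("reference.png")                        # BGR image as loaded by OpenCV
pts_list, scores_list, bboxes_list = align_instance(img_bgr, maxface=True)
x, y, w, h = bboxes_list[0]                                  # largest face as (x, y, w, h)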
283 hyvideo/data_kits/face_align/detface.py Normal file
@@ -0,0 +1,283 @@
# -*- coding: UTF-8 -*-
import os
import cv2
import numpy as np
import torch
import torchvision


def xyxy2xywh(x):
    # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[:, 0] = (x[:, 0] + x[:, 2]) / 2  # x center
    y[:, 1] = (x[:, 1] + x[:, 3]) / 2  # y center
    y[:, 2] = x[:, 2] - x[:, 0]  # width
    y[:, 3] = x[:, 3] - x[:, 1]  # height
    return y


def xywh2xyxy(x):
    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y


def box_iou(box1, box2):
    # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
    """
    Return intersection-over-union (Jaccard index) of boxes.
    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
    Arguments:
        box1 (Tensor[N, 4])
        box2 (Tensor[M, 4])
    Returns:
        iou (Tensor[N, M]): the NxM matrix containing the pairwise
            IoU values for every element in boxes1 and boxes2
    """

    def box_area(box):
        # box = 4xn
        return (box[2] - box[0]) * (box[3] - box[1])

    area1 = box_area(box1.T)
    area2 = box_area(box2.T)

    # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
    inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) -
             torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
    # iou = inter / (area1 + area2 - inter)
    return inter / (area1[:, None] + area2 - inter)


def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    # Rescale coords (xyxy) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    coords[:, [0, 2]] -= pad[0]  # x padding
    coords[:, [1, 3]] -= pad[1]  # y padding
    coords[:, :4] /= gain
    clip_coords(coords, img0_shape)
    return coords


def clip_coords(boxes, img_shape):
    # Clip xyxy bounding boxes to image shape (height, width)
    boxes[:, 0].clamp_(0, img_shape[1])  # x1
    boxes[:, 1].clamp_(0, img_shape[0])  # y1
    boxes[:, 2].clamp_(0, img_shape[1])  # x2
    boxes[:, 3].clamp_(0, img_shape[0])  # y2


def scale_coords_landmarks(img1_shape, coords, img0_shape, ratio_pad=None):
    # Rescale coords (xyxy) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    coords[:, [0, 2, 4, 6, 8]] -= pad[0]  # x padding
    coords[:, [1, 3, 5, 7, 9]] -= pad[1]  # y padding
    coords[:, :10] /= gain
    # clip_coords(coords, img0_shape)
    coords[:, 0].clamp_(0, img0_shape[1])  # x1
    coords[:, 1].clamp_(0, img0_shape[0])  # y1
    coords[:, 2].clamp_(0, img0_shape[1])  # x2
    coords[:, 3].clamp_(0, img0_shape[0])  # y2
    coords[:, 4].clamp_(0, img0_shape[1])  # x3
    coords[:, 5].clamp_(0, img0_shape[0])  # y3
    coords[:, 6].clamp_(0, img0_shape[1])  # x4
    coords[:, 7].clamp_(0, img0_shape[0])  # y4
    coords[:, 8].clamp_(0, img0_shape[1])  # x5
    coords[:, 9].clamp_(0, img0_shape[0])  # y5
    return coords


def show_results(img, xywh, conf, landmarks, class_num):
    h, w, c = img.shape
    tl = 1 or round(0.002 * (h + w) / 2) + 1  # line/font thickness
    x1 = int(xywh[0] * w - 0.5 * xywh[2] * w)
    y1 = int(xywh[1] * h - 0.5 * xywh[3] * h)
    x2 = int(xywh[0] * w + 0.5 * xywh[2] * w)
    y2 = int(xywh[1] * h + 0.5 * xywh[3] * h)
    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), thickness=tl, lineType=cv2.LINE_AA)

    clors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)]

    for i in range(5):
        point_x = int(landmarks[2 * i] * w)
        point_y = int(landmarks[2 * i + 1] * h)
        cv2.circle(img, (point_x, point_y), tl + 1, clors[i], -1)

    tf = max(tl - 1, 1)  # font thickness
    label = str(conf)[:5]
    cv2.putText(img, label, (x1, y1 - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
    return img


def make_divisible(x, divisor):
    # Returns x evenly divisible by divisor
    return (x // divisor) * divisor

def non_max_suppression_face(prediction, conf_thres=0.5, iou_thres=0.45, classes=None, agnostic=False, labels=()):
    """Performs Non-Maximum Suppression (NMS) on inference results
    Returns:
        detections with shape: nx16 (x1, y1, x2, y2, conf, 10 landmark coordinates, cls)
    """

    nc = prediction.shape[2] - 15  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # Settings
    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    # time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label = nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    # t = time.time()
    output = [torch.zeros((0, 16), device=prediction.device)] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            l = labels[xi]
            v = torch.zeros((len(l), nc + 15), device=x.device)
            v[:, :4] = l[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(l)), l[:, 0].long() + 15] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 15:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx16 (xyxy, conf, landmarks, cls)
        if multi_label:
            i, j = (x[:, 15:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, j + 15, None], x[i, 5:15], j[:, None].float()), 1)
        else:  # best class only
            conf, j = x[:, 15:].max(1, keepdim=True)
            x = torch.cat((box, conf, x[:, 5:15], j.float()), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # If none remain process next image
        n = x.shape[0]  # number of boxes
        if not n:
            continue

        # Batched NMS
        c = x[:, 15:16] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        # if i.shape[0] > max_det:  # limit detections
        #     i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        # if (time.time() - t) > time_limit:
        #     break  # time limit exceeded

    return output


class DetFace():
    def __init__(self, pt_path, confThreshold=0.5, nmsThreshold=0.45, device='cuda'):
        assert os.path.exists(pt_path)

        self.inpSize = 416
        self.conf_thres = confThreshold
        self.iou_thres = nmsThreshold
        self.test_device = torch.device(device if torch.cuda.is_available() else "cpu")
        self.model = torch.jit.load(pt_path).to(self.test_device)
        self.last_w = 416
        self.last_h = 416
        self.grids = None

    @torch.no_grad()
    def detect(self, srcimg):
        # t0 = time.time()

        h0, w0 = srcimg.shape[:2]  # orig hw
        r = self.inpSize / min(h0, w0)  # resize image to img_size
        h1 = int(h0 * r + 31) // 32 * 32
        w1 = int(w0 * r + 31) // 32 * 32

        img = cv2.resize(srcimg, (w1, h1), interpolation=cv2.INTER_LINEAR)

        # Convert
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # BGR to RGB

        # Run inference
        img = torch.from_numpy(img).to(self.test_device).permute(2, 0, 1)
        img = img.float() / 255  # uint8 to fp16/32, 0-1
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Inference
        if h1 != self.last_h or w1 != self.last_w or self.grids is None:
            grids = []
            for scale in [8, 16, 32]:
                ny = h1 // scale
                nx = w1 // scale
                yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
                grid = torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
                grids.append(grid.to(self.test_device))
            self.grids = grids
            self.last_w = w1
            self.last_h = h1

        pred = self.model(img, self.grids).cpu()

        # Apply NMS
        det = non_max_suppression_face(pred, self.conf_thres, self.iou_thres)[0]
        # Process detections
        # det = pred[0]
        bboxes = np.zeros((det.shape[0], 4))
        kpss = np.zeros((det.shape[0], 5, 2))
        scores = np.zeros((det.shape[0]))
        # gn = torch.tensor([w0, h0, w0, h0]).to(pred)  # normalization gain whwh
        # gn_lks = torch.tensor([w0, h0, w0, h0, w0, h0, w0, h0, w0, h0]).to(pred)  # normalization gain landmarks
        det = det.cpu().numpy()

        for j in range(det.shape[0]):
            # xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(4).cpu().numpy()
            bboxes[j, 0] = det[j, 0] * w0 / w1
            bboxes[j, 1] = det[j, 1] * h0 / h1
            bboxes[j, 2] = det[j, 2] * w0 / w1 - bboxes[j, 0]
            bboxes[j, 3] = det[j, 3] * h0 / h1 - bboxes[j, 1]
            scores[j] = det[j, 4]
            # landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(5,2).cpu().numpy()
            kpss[j, :, :] = det[j, 5:15].reshape(5, 2) * np.array([[w0 / w1, h0 / h1]])
            # class_num = det[j, 15].cpu().numpy()
            # orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
        return bboxes, kpss, scores
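A quick, hedged sanity check (not part of the commit) for box_iou, which expects (x1, y1, x2, y2) boxes and returns the NxM pairwise IoU matrix.

import torch
from hyvideo.data_kits.face_align.detface import box_iou

a = torch.tensor([[0., 0., 10., 10.]])
b = torch.tensor([[0., 0., 10., 10.], [5., 5., 15., 15.]])
print(box_iou(a, b))   # tensor([[1.0000, 0.1429]]): identical box, then 25/175 overlap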
@@ -1 +1,2 @@
from .pipeline_hunyuan_video import HunyuanVideoPipeline
from .pipeline_hunyuan_video_audio import HunyuanVideoAudioPipeline
@@ -1142,7 +1142,7 @@ class HunyuanVideoPipeline(DiffusionPipeline):

        target_dtype = PRECISION_TO_TYPE[precision]
        autocast_enabled = target_dtype != torch.float32 and not disable_autocast
        vae_dtype = PRECISION_TO_TYPE[vae_precision]
        vae_dtype = self.vae._model_dtype  # PRECISION_TO_TYPE[vae_precision]
        vae_autocast_enabled = vae_dtype != torch.float32 and not disable_autocast

        # 7. Denoising loop
@@ -1262,6 +1262,7 @@ class HunyuanVideoPipeline(DiffusionPipeline):
                        guidance=guidance_expand,
                        pipeline=self,
                        x_id=j,
                        step_no=i,
                        callback=callback,
                    )
                    if self._interrupt:
@@ -1290,6 +1291,7 @@ class HunyuanVideoPipeline(DiffusionPipeline):
                        freqs_sin=freqs_cis[1],  # [seqlen, head_dim]
                        guidance=guidance_expand,
                        pipeline=self,
                        step_no=i,
                        callback=callback,
                    )
                    if self._interrupt:
@@ -1404,7 +1406,6 @@ class HunyuanVideoPipeline(DiffusionPipeline):
        else:
            image = latents

        image = (image / 2 + 0.5).clamp(0, 1)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        image = image.cpu().float()
1359 hyvideo/diffusion/pipelines/pipeline_hunyuan_video_audio.py Normal file
File diff suppressed because it is too large.
@@ -5,7 +5,7 @@ import functools
from typing import List, Optional, Tuple, Union

from pathlib import Path

from einops import rearrange
import torch
import torch.distributed as dist
from hyvideo.constants import PROMPT_TEMPLATE, NEGATIVE_PROMPT, PRECISION_TO_TYPE, NEGATIVE_PROMPT_I2V
@@ -16,11 +16,34 @@ from hyvideo.utils.data_utils import align_to, get_closest_ratio, generate_crop_
from hyvideo.modules.posemb_layers import get_nd_rotary_pos_embed, get_nd_rotary_pos_embed_new
from hyvideo.diffusion.schedulers import FlowMatchDiscreteScheduler
from hyvideo.diffusion.pipelines import HunyuanVideoPipeline
from hyvideo.diffusion.pipelines import HunyuanVideoAudioPipeline
from PIL import Image
import numpy as np
import torchvision.transforms as transforms
import cv2
from wan.utils.utils import resize_lanczos, calculate_new_dimensions
from hyvideo.data_kits.audio_preprocessor import encode_audio, get_facemask
from transformers import WhisperModel
from transformers import AutoFeatureExtractor
from hyvideo.data_kits.face_align import AlignImage
import librosa

def get_audio_feature(feature_extractor, audio_path, duration):
    audio_input, sampling_rate = librosa.load(audio_path, duration=duration, sr=16000)
    assert sampling_rate == 16000

    audio_features = []
    window = 750 * 640
    for i in range(0, len(audio_input), window):
        audio_feature = feature_extractor(audio_input[i:i + window],
                                          sampling_rate=sampling_rate,
                                          return_tensors="pt",
                                          device="cuda"
                                          ).input_features
        audio_features.append(audio_feature)

    audio_features = torch.cat(audio_features, dim=-1)
    return audio_features, len(audio_input) // 640

def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
    crop_h, crop_w = crop_img.shape[:2]
@@ -212,6 +235,14 @@ def patched_llava_forward(
        image_hidden_states=image_features if pixel_values is not None else None,
    )

def adapt_avatar_model(model):
    modules_dict = {k: m for k, m in model.named_modules()}
    for model_layer, avatar_layer in model.double_stream_map.items():
        module = modules_dict[f"audio_adapter_blocks.{avatar_layer}"]
        target = modules_dict[f"double_blocks.{model_layer}"]
        setattr(target, "audio_adapter", module)
    delattr(model, "audio_adapter_blocks")

class DataPreprocess(object):
    def __init__(self):
        self.llava_size = (336, 336)
@@ -223,12 +254,18 @@ class DataPreprocess(object):
            ]
        )

    def get_batch(self, image, size):
    def get_batch(self, image, size, pad=False):
        image = np.asarray(image)
        llava_item_image = pad_image(image.copy(), self.llava_size)
        if pad:
            llava_item_image = pad_image(image.copy(), self.llava_size)
        else:
            llava_item_image = image.copy()
        uncond_llava_item_image = np.ones_like(llava_item_image) * 255
        cat_item_image = pad_image(image.copy(), size)

        if pad:
            cat_item_image = pad_image(image.copy(), size)
        else:
            cat_item_image = image.copy()
        llava_item_tensor = self.llava_transform(Image.fromarray(llava_item_image.astype(np.uint8)))
        uncond_llava_item_tensor = self.llava_transform(Image.fromarray(uncond_llava_item_image))
        cat_item_tensor = torch.from_numpy(cat_item_image.copy()).permute((2, 0, 1)) / 255.0
@@ -243,6 +280,8 @@ class Inference(object):
    def __init__(
        self,
        i2v,
        custom,
        avatar,
        enable_cfg,
        vae,
        vae_kwargs,
@@ -250,9 +289,14 @@ class Inference(object):
        model,
        text_encoder_2=None,
        pipeline=None,
        feature_extractor=None,
        wav2vec=None,
        align_instance=None,
        device=None,
    ):
        self.i2v = i2v
        self.custom = custom
        self.avatar = avatar
        self.enable_cfg = enable_cfg
        self.vae = vae
        self.vae_kwargs = vae_kwargs
@@ -263,8 +307,11 @@ class Inference(object):
        self.model = model
        self.pipeline = pipeline

        self.device = "cuda"
        self.feature_extractor = feature_extractor
        self.wav2vec = wav2vec
        self.align_instance = align_instance

        self.device = "cuda"

    @classmethod
@@ -285,15 +332,21 @@ class Inference(object):
        i2v_condition_type = None
        i2v_mode = "i2v" in model_filepath[0]
        custom = False
        avatar = False
        if i2v_mode:
            model_id = "HYVideo-T/2"
            i2v_condition_type = "token_replace"
        elif "custom" in model_filepath[0]:
            model_id = "HYVideo-T/2-custom"
            custom = True
        elif "avatar" in model_filepath[0]:
            model_id = "HYVideo-T/2-avatar"
            text_len = 256
            avatar = True
        else:
            model_id = "HYVideo-T/2-cfgdistill"

        if i2v_mode and i2v_condition_type == "latent_concat":
            in_channels = latent_channels * 2 + 1
            image_embed_interleave = 2
@@ -323,11 +376,11 @@ class Inference(object):
        from mmgp import offload
        # model = Inference.load_state_dict(args, model, model_filepath)

        # model_filepath = "c:/temp/hc/mp_rank_00_model_states.pt"
        # model_filepath = "c:/temp/avatar/mp_rank_00_model_states.pt"
        offload.load_model_data(model, model_filepath, pinToMemory=pinToMemory, partialPinning=partialPinning)
        pass
        # offload.save_model(model, "hunyuan_video_custom_720_bf16.safetensors")
        # offload.save_model(model, "hunyuan_video_custom_720_quanto_bf16_int8.safetensors", do_quantize=True)
        # offload.save_model(model, "hunyuan_video_avatar_720_bf16.safetensors")
        # offload.save_model(model, "hunyuan_video_avatar_720_quanto_bf16_int8.safetensors", do_quantize=True)

        model.mixed_precision = mixed_precision_transformer
@@ -338,9 +391,12 @@ class Inference(object):

        # ============================= Build extra models ========================
        # VAE
        if custom:
        if custom or avatar:
            vae_configpath = "ckpts/hunyuan_video_custom_VAE_config.json"
            vae_filepath = "ckpts/hunyuan_video_custom_VAE_fp32.safetensors"
        # elif avatar:
        #     vae_configpath = "ckpts/config_vae_avatar.json"
        #     vae_filepath = "ckpts/vae_avatar.pt"
        else:
            vae_configpath = "ckpts/hunyuan_video_VAE_config.json"
            vae_filepath = "ckpts/hunyuan_video_VAE_fp32.safetensors"
@@ -350,6 +406,7 @@ class Inference(object):

        vae, _, s_ratio, t_ratio = load_vae("884-16c-hy", vae_path=vae_filepath, vae_config_path=vae_configpath, vae_precision=vae_precision, device="cpu", )

        vae._model_dtype = torch.float32 if VAE_dtype == torch.float32 else (torch.float16 if avatar else torch.bfloat16)
        vae._model_dtype = torch.float32 if VAE_dtype == torch.float32 else torch.bfloat16
        vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}
        enable_cfg = False
@@ -359,7 +416,7 @@ class Inference(object):
            tokenizer = "llm-i2v"
            prompt_template = "dit-llm-encode-i2v"
            prompt_template_video = "dit-llm-encode-video-i2v"
        elif custom:
        elif custom or avatar:
            text_encoder = "llm-i2v"
            tokenizer = "llm-i2v"
            prompt_template = "dit-llm-encode"
@@ -411,14 +468,33 @@ class Inference(object):
            device="cpu",
        )

        feature_extractor = None
        wav2vec = None
        align_instance = None

        if avatar:
            feature_extractor = AutoFeatureExtractor.from_pretrained("ckpts/whisper-tiny/")
            wav2vec = WhisperModel.from_pretrained("ckpts/whisper-tiny/").to(device="cpu", dtype=torch.float32)
            wav2vec._model_dtype = torch.float32
            wav2vec.requires_grad_(False)
            align_instance = AlignImage("cuda", det_path="ckpts/det_align/detface.pt")
            align_instance.facedet.model.to("cpu")

            adapt_avatar_model(model)

        return cls(
            i2v=i2v_mode,
            custom=custom,
            avatar=avatar,
            enable_cfg=enable_cfg,
            vae=vae,
            vae_kwargs=vae_kwargs,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            model=model,
            feature_extractor=feature_extractor,
            wav2vec=wav2vec,
            align_instance=align_instance,
            device=device,
        )
@@ -428,6 +504,8 @@ class HunyuanVideoSampler(Inference):
    def __init__(
        self,
        i2v,
        custom,
        avatar,
        enable_cfg,
        vae,
        vae_kwargs,
@@ -435,10 +513,15 @@ class HunyuanVideoSampler(Inference):
        model,
        text_encoder_2=None,
        pipeline=None,
        feature_extractor=None,
        wav2vec=None,
        align_instance=None,
        device=0,
    ):
        super().__init__(
            i2v,
            custom,
            avatar,
            enable_cfg,
            vae,
            vae_kwargs,
@@ -446,12 +529,16 @@ class HunyuanVideoSampler(Inference):
            model,
            text_encoder_2=text_encoder_2,
            pipeline=pipeline,
            feature_extractor=feature_extractor,
            wav2vec=wav2vec,
            align_instance=align_instance,
            device=device,
        )

        self.i2v_mode = i2v
        self.enable_cfg = enable_cfg
        self.pipeline = self.load_diffusion_pipeline(
            avatar=self.avatar,
            vae=self.vae,
            text_encoder=self.text_encoder,
            text_encoder_2=self.text_encoder_2,
@@ -474,6 +561,7 @@ class HunyuanVideoSampler(Inference):

    def load_diffusion_pipeline(
        self,
        avatar,
        vae,
        text_encoder,
        text_encoder_2,
@@ -491,14 +579,24 @@ class HunyuanVideoSampler(Inference):
            solver="euler",
        )

        pipeline = HunyuanVideoPipeline(
            vae=vae,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            transformer=model,
            scheduler=scheduler,
            progress_bar_config=progress_bar_config,
        )
        if avatar:
            pipeline = HunyuanVideoAudioPipeline(
                vae=vae,
                text_encoder=text_encoder,
                text_encoder_2=text_encoder_2,
                transformer=model,
                scheduler=scheduler,
                progress_bar_config=progress_bar_config,
            )
        else:
            pipeline = HunyuanVideoPipeline(
                vae=vae,
                text_encoder=text_encoder,
                text_encoder_2=text_encoder_2,
                transformer=model,
                scheduler=scheduler,
                progress_bar_config=progress_bar_config,
            )

        return pipeline
@@ -588,6 +686,8 @@ class HunyuanVideoSampler(Inference):
        self,
        input_prompt,
        input_ref_images=None,
        audio_guide=None,
        fps=24,
        height=192,
        width=336,
        frame_num=129,
@@ -617,14 +717,12 @@ class HunyuanVideoSampler(Inference):
            self.vae.tile_sample_min_size = VAE_tile_size["tile_sample_min_size"]
            self.vae.tile_latent_min_size = VAE_tile_size["tile_latent_min_size"]
            self.vae.tile_overlap_factor = VAE_tile_size["tile_overlap_factor"]
            self.vae.enable_tiling()

        i2v_mode = self.i2v_mode
        if not self.enable_cfg:
            guide_scale = 1.0

        out_dict = dict()

        # ========================================================================
        # Arguments: seed
        # ========================================================================
@@ -663,7 +761,6 @@ class HunyuanVideoSampler(Inference):
        seed_everything(seed)
        generator = [torch.Generator("cuda").manual_seed(seed) for seed in seeds]
        # generator = [torch.Generator(self.device).manual_seed(seed) for seed in seeds]
        out_dict["seeds"] = seeds

        # ========================================================================
        # Arguments: target_width, target_height, target_frame_num
@@ -681,8 +778,6 @@ class HunyuanVideoSampler(Inference):
        target_width = align_to(width, 16)
        target_frame_num = frame_num

        out_dict["size"] = (target_height, target_width, target_frame_num)

        if input_ref_images != None:
            # ip_cfg_scale = 3.0
            ip_cfg_scale = 0
@@ -769,28 +864,91 @@ class HunyuanVideoSampler(Inference):
        if input_ref_images == None:
            freqs_cos, freqs_sin = self.get_rotary_pos_embed(target_frame_num, target_height, target_width, enable_riflex)
        else:
            concat_dict = {'mode': 'timecat-w', 'bias': -1}
            freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(target_frame_num, target_height, target_width, concat_dict)
            if self.avatar:
                w, h = input_ref_images.size
                target_height, target_width = calculate_new_dimensions(target_height, target_width, h, w, fit_into_canvas)
                concat_dict = {'mode': 'timecat', 'bias': -1}
                freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(129, target_height, target_width, concat_dict)
            else:
                concat_dict = {'mode': 'timecat-w', 'bias': -1}
                freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(target_frame_num, target_height, target_width, concat_dict)

        n_tokens = freqs_cos.shape[0]

        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)
        # ========================================================================
        # Pipeline inference
        # ========================================================================
        start_time = time.time()

        # "pixel_value_llava": llava_item_tensor.unsqueeze(0),
        # "uncond_pixel_value_llava": uncond_llava_item_tensor.unsqueeze(0),
        # 'pixel_value_ref': cat_item_tensor.unsqueeze(0),
        pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = None, None, None
        if input_ref_images == None:
            pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = None, None, None
            name = None
        else:
            pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = DataPreprocess().get_batch(input_ref_images, (target_width, target_height))
            pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = DataPreprocess().get_batch(input_ref_images, (target_width, target_height), pad=self.custom)

        ref_latents, uncond_audio_prompts, audio_prompts, face_masks, motion_exp, motion_pose = None, None, None, None, None, None

        if audio_guide != None:
            if n_prompt == None or len(n_prompt) == 0:
                n_prompt = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, Lens changes"

            uncond_pixel_value_llava = pixel_value_llava.clone()

            pixel_value_ref = pixel_value_ref.unsqueeze(0)
            self.align_instance.facedet.model.to("cuda")
            face_masks = get_facemask(pixel_value_ref.to("cuda") * 255, self.align_instance, area=3.0)
            # iii = (face_masks.squeeze(0).squeeze(0).permute(1,2,0).repeat(1,1,3)*255).cpu().numpy().astype(np.uint8)
            # image = Image.fromarray(iii)
            # image.save("mask.png")
            # jjj = (pixel_value_ref.squeeze(0).squeeze(0).permute(1,2,0)*255).cpu().numpy().astype(np.uint8)

            self.align_instance.facedet.model.to("cpu")
            # pixel_value_ref = pixel_value_ref.clone().repeat(1,129,1,1,1)

            pixel_value_ref = pixel_value_ref.repeat(1, 1 + 4 * 2, 1, 1, 1)
            pixel_value_ref = pixel_value_ref * 2 - 1
            pixel_value_ref_for_vae = rearrange(pixel_value_ref, "b f c h w -> b c f h w")

            vae_dtype = self.vae.dtype
            with torch.autocast(device_type="cuda", dtype=vae_dtype, enabled=vae_dtype != torch.float32):
                ref_latents = self.vae.encode(pixel_value_ref_for_vae).latent_dist.sample()
                ref_latents = torch.cat([ref_latents[:, :, :1], ref_latents[:, :, 1:2].repeat(1, 1, 31, 1, 1), ref_latents[:, :, -1:]], dim=2)
            pixel_value_ref, pixel_value_ref_for_vae = None, None

            if hasattr(self.vae.config, 'shift_factor') and self.vae.config.shift_factor:
                ref_latents.sub_(self.vae.config.shift_factor).mul_(self.vae.config.scaling_factor)
            else:
                ref_latents.mul_(self.vae.config.scaling_factor)

            # out_latents = ref_latents / self.vae.config.scaling_factor
            # image = self.vae.decode(out_latents, return_dict=False, generator=generator)[0]
            # image = image.clamp(-1, 1)
            # from wan.utils.utils import cache_video
            # cache_video(tensor=image, save_file="decode.mp4", fps=25, nrow=1, normalize=True, value_range=(-1, 1))

            face_masks = torch.nn.functional.interpolate(face_masks.float().squeeze(2),
                                                         (ref_latents.shape[-2],
                                                          ref_latents.shape[-1]),
                                                         mode="bilinear").unsqueeze(2).to(dtype=ref_latents.dtype)

            audio_input, audio_len = get_audio_feature(self.feature_extractor, audio_guide, duration=frame_num / fps)
            audio_prompts = audio_input[0]
            weight_dtype = audio_prompts.dtype

            motion_pose = np.array([25] * 4)
            motion_exp = np.array([30] * 4)
            motion_pose = torch.from_numpy(motion_pose).unsqueeze(0)
            motion_exp = torch.from_numpy(motion_exp).unsqueeze(0)
            audio_prompts = encode_audio(self.wav2vec, audio_prompts.to(dtype=self.wav2vec.dtype), fps, num_frames=audio_len)
            audio_prompts = audio_prompts.to(self.model.dtype)
            if audio_prompts.shape[1] <= 129:
                audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1, 129 - audio_prompts.shape[1], 1, 1, 1)], dim=1)
            else:
                audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1, 5, 1, 1, 1)], dim=1)
            uncond_audio_prompts = torch.zeros_like(audio_prompts[:, :129])
            # target_frame_num = min(target_frame_num, audio_len)
        samples = self.pipeline(
            prompt=input_prompt,
            height=target_height,
@@ -803,9 +961,18 @@ class HunyuanVideoSampler(Inference):
            generator=generator,
            output_type="pil",
            name=name,
            pixel_value_llava=pixel_value_llava,
            uncond_pixel_value_llava=uncond_pixel_value_llava,
            pixel_value_ref=pixel_value_ref,

            pixel_value_ref=pixel_value_ref,
            ref_latents=ref_latents,                            # [1, 16, 1, h//8, w//8]
            pixel_value_llava=pixel_value_llava,                # [1, 3, 336, 336]
            uncond_pixel_value_llava=uncond_pixel_value_llava,
            face_masks=face_masks,                              # [b f h w]
            audio_prompts=audio_prompts,
            uncond_audio_prompts=uncond_audio_prompts,
            motion_exp=motion_exp,
            motion_pose=motion_pose,
            fps=torch.from_numpy(np.array(fps)),

            denoise_strength=denoise_strength,
            ip_cfg_scale=ip_cfg_scale,
            freqs_cis=(freqs_cos, freqs_sin),
@@ -825,9 +992,9 @@ class HunyuanVideoSampler(Inference):
            callback=callback,
            callback_steps=callback_steps,
        )[0]
        gen_time = time.time() - start_time

        if samples == None:
            return None
        samples = samples.sub_(0.5).mul_(2).squeeze(0)
        samples = samples.squeeze(0)

        return samples
220 hyvideo/modules/audio_adapters.py Normal file
@@ -0,0 +1,220 @@
"""
This module provides the implementation of an Audio Projection Model, which is designed for
audio processing tasks. The model takes audio embeddings as input and outputs context tokens
that can be used for various downstream applications, such as audio analysis or synthesis.

The AudioProjModel class is based on the ModelMixin class from the diffusers library, which
provides a foundation for building custom models. This implementation includes multiple linear
layers with ReLU activation functions and a LayerNorm for normalization.

Key Features:
- Audio embedding input with flexible sequence length and block structure.
- Multiple linear layers for feature transformation.
- ReLU activation for non-linear transformation.
- LayerNorm for stabilizing and speeding up training.
- Rearrangement of input embeddings to match the model's expected input shape.
- Customizable number of blocks, channels, and context tokens for adaptability.

The module is structured to be easily integrated into larger systems or used as a standalone
component for audio feature extraction and processing.

Classes:
- AudioProjModel: A class representing the audio projection model with configurable parameters.

Functions:
- (none)

Dependencies:
- torch: For tensor operations and neural network components.
- diffusers: For the ModelMixin base class.
- einops: For tensor rearrangement operations.

"""

import torch
from diffusers import ModelMixin
from einops import rearrange

import math
import torch.nn as nn

class AudioProjNet2(ModelMixin):
    """Audio Projection Model

    This class defines an audio projection model that takes audio embeddings as input
    and produces context tokens as output. The model is based on the ModelMixin class
    and consists of multiple linear layers and activation functions. It can be used
    for various audio processing tasks.

    Attributes:
        seq_len (int): The length of the audio sequence.
        blocks (int): The number of blocks in the audio projection model.
        channels (int): The number of channels in the audio projection model.
        intermediate_dim (int): The intermediate dimension of the model.
        context_tokens (int): The number of context tokens in the output.
        output_dim (int): The output dimension of the context tokens.

    Methods:
        __init__(self, seq_len=5, blocks=12, channels=768, intermediate_dim=512, context_tokens=4, output_dim=768):
            Initializes the AudioProjModel with the given parameters.
        forward(self, audio_embeds):
            Defines the forward pass for the AudioProjModel.
            Parameters:
                audio_embeds (torch.Tensor): The input audio embeddings with shape (batch_size, video_length, blocks, channels).
            Returns:
                context_tokens (torch.Tensor): The output context tokens with shape (batch_size, video_length, context_tokens, output_dim).

    """

    def __init__(
        self,
        seq_len=5,
        blocks=12,  # add a new parameter blocks
        channels=768,  # add a new parameter channels
        intermediate_dim=512,
        output_dim=768,
        context_tokens=4,
    ):
        super().__init__()

        self.seq_len = seq_len
        self.blocks = blocks
        self.channels = channels
        self.input_dim = (
            seq_len * blocks * channels
        )
        self.intermediate_dim = intermediate_dim
        self.context_tokens = context_tokens
        self.output_dim = output_dim

        # define multiple linear layers
        self.proj1 = nn.Linear(self.input_dim, intermediate_dim)
        self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
        self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)

        self.norm = nn.LayerNorm(output_dim)

    def forward(self, audio_embeds):

        video_length = audio_embeds.shape[1]
        audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
        batch_size, window_size, blocks, channels = audio_embeds.shape
        audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels)

        audio_embeds = torch.relu(self.proj1(audio_embeds))
        audio_embeds = torch.relu(self.proj2(audio_embeds))

        context_tokens = self.proj3(audio_embeds).reshape(
            batch_size, self.context_tokens, self.output_dim
        )
        context_tokens = self.norm(context_tokens)
        out_all = rearrange(
            context_tokens, "(bz f) m c -> bz f m c", f=video_length
        )

        return out_all

def reshape_tensor(x, heads):
    bs, length, width = x.shape
    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
    x = x.view(bs, length, heads, -1)
    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
    x = x.transpose(1, 2)
    # (bs, n_heads, length, dim_per_head)
    x = x.reshape(bs, heads, length, -1)
    return x


class PerceiverAttentionCA(nn.Module):
    def __init__(self, *, dim=3072, dim_head=1024, heads=33):
        super().__init__()
        self.scale = dim_head ** -0.5
        self.dim_head = dim_head
        self.heads = heads
        inner_dim = dim_head  # * heads

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)

        import torch.nn.init as init
        init.zeros_(self.to_out.weight)
        if self.to_out.bias is not None:
            init.zeros_(self.to_out.bias)

    def forward(self, x, latents):
        """
        Args:
            x (torch.Tensor): image features
                shape (b, t, aa, D)
            latent (torch.Tensor): latent features
                shape (b, t, hw, D)
        """
        x = self.norm1(x)
        latents = self.norm2(latents)
        # print("latents shape: ", latents.shape)
        # print("x shape: ", x.shape)
        q = self.to_q(latents)
        k, v = self.to_kv(x).chunk(2, dim=-1)

        # attention
        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
        out = weight @ v

        # out = out.permute(0, 2, 1, 3)
        return self.to_out(out)

    # def forward(self, x, latents):
    #     """
    #     Args:
    #         x (torch.Tensor): image features
    #             shape (b, t, aa, D)
    #         latent (torch.Tensor): latent features
    #             shape (b, t, hw, D)
    #     """
    #     if get_sequence_parallel_state():
    #         sp_size = nccl_info.sp_size
    #         sp_rank = nccl_info.rank_within_group
    #         print("rank:", latents.shape, sp_size, sp_rank)
    #         latents = torch.chunk(latents, sp_size, dim=1)[sp_rank]

    #     x = self.norm1(x)
    #     latents = self.norm2(latents)
    #     # print("latents shape: ", latents.shape)
    #     # print("x shape: ", x.shape)
    #     q = self.to_q(latents)
    #     k, v = self.to_kv(x).chunk(2, dim=-1)

    #     # print("q, k, v: ", q.shape, k.shape, v.shape)

    #     # attention
    #     # scale = 1 / math.sqrt(math.sqrt(self.dim_head))
    #     # weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
    #     # weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
    #     # out = weight @ v
    #     def shrink_head(encoder_state, dim):
    #         local_heads = encoder_state.shape[dim] // nccl_info.sp_size
    #         return encoder_state.narrow(dim, nccl_info.rank_within_group * local_heads, local_heads)

    #     if get_sequence_parallel_state():
    #         # batch_size, seq_len, attn_heads, head_dim
    #         q = all_to_all_4D(q, scatter_dim=2, gather_dim=1)  # [2, 32256, 24, 128]
    #         k = shrink_head(k, dim=2)
    #         v = shrink_head(v, dim=2)
    #         qkv = torch.stack([query, key, value], dim=2)
    #         attn = flash_attn_no_pad(qkv, causal=False, dropout_p=0.0, softmax_scale=None)
    #     # out = out.permute(0, 2, 1, 3)
    #     # b, s, a, d = attn.shape
    #     # attn = attn.reshape(b, s, -1)
    #
    #     out = self.to_out(attn)
    #     if get_sequence_parallel_state():
    #         out = all_gather(out, dim=1)
    #     return out
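A hedged shape check (not part of the commit): the adapter above maps per-frame audio windows of shape (batch, frames, 10, 5, 384) to 4 context tokens of width 3072 per frame, matching the configuration instantiated in models.py in this commit.

import torch
from hyvideo.modules.audio_adapters import AudioProjNet2

proj = AudioProjNet2(seq_len=10, blocks=5, channels=384,
                     intermediate_dim=1024, output_dim=3072, context_tokens=4)
audio_embeds = torch.randn(1, 129, 10, 5, 384)   # (batch, frames, window, layers, width)
print(proj(audio_embeds).shape)                  # torch.Size([1, 129, 4, 3072])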
@@ -53,10 +53,11 @@ class PatchEmbed(nn.Module):

    def forward(self, x):
        x = self.proj(x)
        shape = x.shape
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x
        return x, shape


class TextProjection(nn.Module):
@ -19,6 +19,7 @@ from .token_refiner import SingleTokenRefiner
import numpy as np
from mmgp import offload
from wan.modules.attention import pay_attention
from .audio_adapters import AudioProjNet2, PerceiverAttentionCA

def get_linear_split_map():
    hidden_size = 3072
@ -589,7 +590,8 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
        use_attention_mask: bool = True,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
        attention_mode: Optional[str] = "sdpa"
        attention_mode: Optional[str] = "sdpa",
        avatar = False,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
@ -708,6 +710,45 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
            get_activation_layer("silu"),
            **factory_kwargs,
        )
        avatar_audio = avatar
        if avatar_audio:
            self.ref_in = PatchEmbed(
                self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
            )

            # -------------------- audio_proj_model --------------------
            self.audio_proj = AudioProjNet2(seq_len=10, blocks=5, channels=384, intermediate_dim=1024, output_dim=3072, context_tokens=4)

            # -------------------- motion embedder --------------------
            self.motion_exp = TimestepEmbedder(
                self.hidden_size // 4,
                get_activation_layer("silu"),
                **factory_kwargs
            )
            self.motion_pose = TimestepEmbedder(
                self.hidden_size // 4,
                get_activation_layer("silu"),
                **factory_kwargs
            )

            self.fps_proj = TimestepEmbedder(
                self.hidden_size,
                get_activation_layer("silu"),
                **factory_kwargs
            )

            self.before_proj = nn.Linear(self.hidden_size, self.hidden_size)

            # -------------------- audio_insert_model --------------------
            self.double_stream_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
            self.single_stream_list = []
            self.double_stream_map = {str(i): j for j, i in enumerate(self.double_stream_list)}
            self.single_stream_map = {str(i): j+len(self.double_stream_list) for j, i in enumerate(self.single_stream_list)}

            self.audio_adapter_blocks = nn.ModuleList([
                PerceiverAttentionCA(dim=3072, dim_head=1024, heads=33) for _ in range(len(self.double_stream_list) + len(self.single_stream_list))
            ])



    def lock_layers_dtypes(self, dtype = torch.float32):
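The `double_stream_map` above just records which double-stream transformer blocks receive an audio cross-attention adapter and which slot of the ModuleList belongs to them. A small illustrative sketch (the attach loop is an assumption about the wiring, not code from this file):

```python
double_stream_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
double_stream_map = {str(i): j for j, i in enumerate(double_stream_list)}

# Block 7 is the 4th entry (index 3) of the adapter ModuleList; block 8 has no adapter.
assert double_stream_map.get("7") == 3
assert double_stream_map.get("8") is None

# Attaching each adapter to its block would then look roughly like:
# for block_idx_str, adapter_idx in double_stream_map.items():
#     setattr(double_blocks[int(block_idx_str)], "audio_adapter", audio_adapter_blocks[adapter_idx])
```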
@ -750,11 +791,17 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
        guidance: torch.Tensor = None, # Guidance for modulation, should be cfg_scale x 1000.
        pipeline=None,
        x_id = 0,
        step_no = 0,
        callback = None,
        audio_prompts = None,
        motion_exp = None,
        motion_pose = None,
        fps = None,
        face_mask = None,
    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:

        img = x
        batch_no, _, ot, oh, ow = x.shape
        bsz, _, ot, oh, ow = x.shape
        del x
        txt = text_states
        tt, th, tw = (
@ -765,6 +812,17 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):

        # Prepare modulation vectors.
        vec = self.time_in(t)
        if motion_exp != None:
            vec += self.motion_exp(motion_exp.view(-1)).view(bsz, -1) # (b, 3072)
        if motion_pose != None:
            vec += self.motion_pose(motion_pose.view(-1)).view(bsz, -1) # (b, 3072)
        if fps != None:
            vec += self.fps_proj(fps) # (b, 3072)
        if audio_prompts != None:
            audio_feature_all = self.audio_proj(audio_prompts)
            audio_feature_pad = audio_feature_all[:,:1].repeat(1,3,1,1)
            audio_feature_all_insert = torch.cat([audio_feature_pad, audio_feature_all], dim=1).view(bsz, ot, 16, 3072)
            audio_feature_all = None

        if self.i2v_condition_type == "token_replace":
            token_replace_t = torch.zeros_like(t)
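The reshape of the audio features is easier to follow with concrete numbers. Assuming the projector emits 4 context tokens per audio frame and 4 audio frames land on each latent frame (both assumptions read off the `view(bsz, ot, 16, 3072)` target shape), a sketch:

```python
import torch

bsz, ot, dim = 1, 33, 3072                                 # e.g. 129 video frames -> 33 latent frames (assumed)
audio_feature_all = torch.randn(bsz, 4 * ot - 3, 4, dim)   # 4 context tokens per audio frame

# Repeat the first frame 3x so the frame count becomes a multiple of 4 per latent frame.
audio_feature_pad = audio_feature_all[:, :1].repeat(1, 3, 1, 1)
audio_all = torch.cat([audio_feature_pad, audio_feature_all], dim=1)   # (1, 4*ot, 4, 3072)
audio_feature_all_insert = audio_all.view(bsz, ot, 16, dim)            # 16 audio tokens per latent frame
print(audio_feature_all_insert.shape)   # torch.Size([1, 33, 16, 3072])
```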
@ -777,7 +835,6 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
        # token_replace_mask_txt = None

        # text modulation
        # vec = vec + self.vector_in(text_states_2)
        vec_2 = self.vector_in(text_states_2)
        del text_states_2
        vec += vec_2
@ -793,12 +850,17 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
            )

            # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
            vec = vec + self.guidance_in(guidance)
            vec += self.guidance_in(guidance)

        # Embed image and text.
        img = self.img_in(img)
        if ref_latents != None:
            ref_latents = self.img_in(ref_latents)
        img, shape_mask = self.img_in(img)
        if audio_prompts != None:
            ref_latents_first = ref_latents[:, :, :1].clone()
            ref_latents,_ = self.ref_in(ref_latents)
            ref_latents_first,_ = self.img_in(ref_latents_first)
        elif ref_latents != None:
            ref_latents, _ = self.img_in(ref_latents)

        if self.text_projection == "linear":
            txt = self.txt_in(txt)
        elif self.text_projection == "single_refiner":
@ -808,7 +870,18 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
                f"Unsupported text_projection: {self.text_projection}"
            )

        if ref_latents == None:
        if audio_prompts != None:
            img += self.before_proj(ref_latents)
            ref_length = ref_latents_first.shape[-2] # [b s c]
            img = torch.cat([ref_latents_first, img], dim=-2) # t c
            img_len = img.shape[1]
            mask_len = img_len - ref_length
            if face_mask.shape[2] == 1:
                face_mask = face_mask.repeat(1,1,ot,1,1) # repeat if number of mask frame is 1
            face_mask = torch.nn.functional.interpolate(face_mask, size=[ot, shape_mask[-2], shape_mask[-1]], mode="nearest")
            # face_mask = face_mask.view(-1,mask_len,1).repeat(1,1,img.shape[-1]).type_as(img)
            face_mask = face_mask.view(-1,mask_len,1).type_as(img)
        elif ref_latents == None:
            ref_length = None
        else:
            ref_length = ref_latents.shape[-2]
@ -828,13 +901,13 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
            if x_id == 0:
                self.should_calc = True
                inp = img[0:1]
                vec_ = vec
                vec_ = vec[0:1]
                ( img_mod1_shift, img_mod1_scale, _ , _ , _ , _ , ) = self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1)
                normed_inp = self.double_blocks[0].img_norm1(inp)
                normed_inp = normed_inp.to(torch.bfloat16)
                modulated_inp = modulate( normed_inp, shift=img_mod1_shift, scale=img_mod1_scale )
                del normed_inp, img_mod1_shift, img_mod1_scale
                if self.teacache_counter <= self.teacache_start_step or self.teacache_counter == self.num_steps-1:
                if step_no <= self.teacache_start_step or step_no == self.num_steps-1:
                    self.accumulated_rel_l1_distance = 0
                else:
                    coefficients = [7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
@ -846,9 +919,6 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
                    else:
                        self.accumulated_rel_l1_distance = 0
                self.previous_modulated_input = modulated_inp
                self.teacache_counter += 1
                if self.teacache_counter == self.num_steps:
                    self.teacache_counter = 0
            else:
                self.should_calc = True

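The TeaCache gate above decides whether to rerun the transformer or reuse the previous residual: the relative L1 change of the modulated input is rescaled by the fitted polynomial and accumulated until it crosses a threshold. A hedged, self-contained sketch of that decision (the threshold value and the exact bookkeeping are assumptions; only the coefficients are taken from the code above):

```python
import numpy as np

coefficients = [7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
poly = np.poly1d(coefficients)

def should_recompute(prev, cur, acc, threshold=0.15):
    """Accumulate the rescaled relative L1 change; recompute only once it crosses the threshold."""
    rel_l1 = float(np.abs(cur - prev).mean() / np.abs(prev).mean())
    acc += poly(rel_l1)
    if acc < threshold:
        return False, acc          # skip: reuse the cached residual
    return True, 0.0               # recompute and reset the accumulator

acc = 0.0
prev = np.random.randn(1024)
for step in range(4):
    cur = prev + 0.01 * np.random.randn(1024)
    recompute, acc = should_recompute(prev, cur, acc)
    prev = cur
```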
@ -859,7 +929,7 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
                self.previous_residual[x_id] = None
                ori_img = img[0:1].clone()
        # --------------------- Pass through DiT blocks ------------------------
        for _, block in enumerate(self.double_blocks):
        for layer_num, block in enumerate(self.double_blocks):
            for i in range(len(img)):
                if callback != None:
                    callback(-1, None, False, True)
@ -880,6 +950,16 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):

                img[i], txt[i] = block(*double_block_args)
                double_block_args = None
                # insert audio feature to img
                if audio_prompts != None:
                    audio_adapter = getattr(self.double_blocks[layer_num], "audio_adapter", None)
                    if audio_adapter != None:
                        real_img = img[i:i+1,ref_length:].view(1, ot, -1, 3072)
                        real_img = audio_adapter(audio_feature_all_insert[i:i+1], real_img).view(1, -1, 3072)
                        real_img *= face_mask[i:i+1]
                        img[i:i+1, ref_length:] += real_img
                        real_img = None


        for _, block in enumerate(self.single_blocks):
            for i in range(len(img)):
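Shape-wise, the audio insertion above takes the video tokens after the reference tokens, folds them back to per-frame groups, runs the Perceiver cross-attention against that frame's 16 audio tokens, scales the result by the face mask and adds it back as a residual. A toy sketch with assumed sizes (the stand-in adapter below is not the real PerceiverAttentionCA):

```python
import torch

ot, hw, dim = 33, 60, 3072
ref_length = 60                                   # tokens of the prepended reference frame
img = torch.randn(1, ref_length + ot * hw, dim)   # reference tokens + video tokens
audio_tokens = torch.randn(1, ot, 16, dim)        # 16 audio context tokens per latent frame
face_mask = torch.rand(1, ot * hw, 1)             # one scalar per video token

def fake_audio_adapter(audio, video):
    # stand-in for PerceiverAttentionCA: returns a per-token update of the same shape
    return torch.zeros_like(video)

real_img = img[:, ref_length:].view(1, ot, -1, dim)             # (1, 33, 60, 3072)
real_img = fake_audio_adapter(audio_tokens, real_img).view(1, -1, dim)
real_img = real_img * face_mask                                  # confine the update to the face region
img[:, ref_length:] += real_img                                  # residual add; reference tokens untouched
```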
@ -932,6 +1012,7 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
            img = torch.cat(img_list)
            img_list = None

        # img = self.unpatchify(img, tt, th, tw)
        img = self.unpatchify(img, tt, th, tw)

        return img
@ -1015,5 +1096,14 @@ HUNYUAN_VIDEO_CONFIG = {
        "heads_num": 24,
        "mlp_width_ratio": 4,
    },
    'HYVideo-T/2-avatar': { # 9.0B / 12.5B
        'mm_double_blocks_depth': 20,
        'mm_single_blocks_depth': 40,
        'rope_dim_list': [16, 56, 56],
        'hidden_size': 3072,
        'heads_num': 24,
        'mlp_width_ratio': 4,
        'avatar': True,
    },

}
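The new `HYVideo-T/2-avatar` entry is what routes the constructor into the audio/motion branches added above via `avatar=True`. As a quick self-contained sanity check on its numbers (the dict below is copied from the entry, not imported):

```python
cfg = {
    "mm_double_blocks_depth": 20,
    "mm_single_blocks_depth": 40,
    "rope_dim_list": [16, 56, 56],
    "hidden_size": 3072,
    "heads_num": 24,
    "mlp_width_ratio": 4,
    "avatar": True,
}

head_dim = cfg["hidden_size"] // cfg["heads_num"]     # 3072 / 24 = 128
assert sum(cfg["rope_dim_list"]) == head_dim          # rope dims must add up to the per-head dim
print(head_dim)                                       # 128
```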
wgp.py
@ -190,15 +190,19 @@ def process_prompt_and_add_tasks(state, model_choice):
        if video_length > sliding_window_size:
            gr.Info(f"The Number of Frames to generate ({video_length}) is greater than the Sliding Window Size ({sliding_window_size}) , multiple Windows will be generated")

    if "phantom" in model_filename or "hunyuan_video_custom" in model_filename:
    if "phantom" in model_filename or "hunyuan_video_custom" in model_filename or "hunyuan_video_avatar" in model_filename:
        image_refs = inputs["image_refs"]

        audio_guide = inputs["audio_guide"]
        if image_refs == None :
            gr.Info("You must provide an Image Reference")
            return
        if len(image_refs) > 1 and "hunyuan_video_custom" in model_filename:
            gr.Info("Only one Image Reference (a person) is supported for the moment by Hunyuan Custom")
        if len(image_refs) > 1 and ("hunyuan_video_custom" in model_filename or "hunyuan_video_avatar" in model_filename):
            gr.Info("Only one Image Reference (a person) is supported for the moment by Hunyuan Custom / Avatar")
            return
        if audio_guide == None and "hunyuan_video_avatar" in model_filename:
            gr.Info("You must provide an audio file")
            return

        if any(isinstance(image[0], str) for image in image_refs) :
            gr.Info("Reference Image should be an Image")
            return
@ -1539,7 +1543,9 @@ wan_choices_i2v=["ckpts/wan2.1_image2video_480p_14B_mbf16.safetensors", "ckpts/w
ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_lora128_bf16.safetensors"]

hunyuan_choices= ["ckpts/hunyuan_video_720_bf16.safetensors", "ckpts/hunyuan_video_720_quanto_int8.safetensors", "ckpts/hunyuan_video_i2v_720_bf16v2.safetensors", "ckpts/hunyuan_video_i2v_720_quanto_int8v2.safetensors",
                  "ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors" ]
                  "ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors",
                  "ckpts/hunyuan_video_avatar_720_bf16.safetensors", "ckpts/hunyuan_video_avatar_720_quanto_bf16_int8.safetensors",
                  ]

transformer_choices = wan_choices_t2v + wan_choices_i2v + ltxv_choices + hunyuan_choices
def get_dependent_models(model_filename, quantization, dtype_policy ):
@ -1549,12 +1555,13 @@ def get_dependent_models(model_filename, quantization, dtype_policy ):
        return [get_model_filename("ltxv_13B", quantization, dtype_policy)]
    else:
        return []
model_types = [ "t2v_1.3B", "t2v", "i2v", "i2v_720p", "flf2v_720p", "vace_1.3B","vace_14B","moviigen", "phantom_1.3B", "phantom_14B", "fantasy", "fun_inp_1.3B", "fun_inp", "recam_1.3B", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
model_types = [ "t2v_1.3B", "t2v", "i2v", "i2v_720p", "flf2v_720p", "vace_1.3B","vace_14B","moviigen", "phantom_1.3B", "phantom_14B", "fantasy", "fun_inp_1.3B", "fun_inp", "recam_1.3B", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom", "hunyuan_avatar"]
model_signatures = {"t2v": "text2video_14B", "t2v_1.3B" : "text2video_1.3B", "fun_inp_1.3B" : "Fun_InP_1.3B", "fun_inp" : "Fun_InP_14B",
                    "i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "vace_14B" : "Vace_14B","recam_1.3B": "recammaster_1.3B",
                    "flf2v_720p" : "FLF2V_720p", "sky_df_1.3B" : "sky_reels2_diffusion_forcing_1.3B", "sky_df_14B" : "sky_reels2_diffusion_forcing_14B",
                    "sky_df_720p_14B" : "sky_reels2_diffusion_forcing_720p_14B", "moviigen" :"moviigen",
                    "phantom_1.3B" : "phantom_1.3B", "phantom_14B" : "phantom_14B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" }
                    "phantom_1.3B" : "phantom_1.3B", "phantom_14B" : "phantom_14B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled",
                    "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom", "hunyuan_avatar" : "hunyuan_video_avatar" }


def get_model_type(model_filename):
@ -1639,7 +1646,10 @@ def get_model_name(model_filename, description_container = [""]):
        description = "A good looking image 2 video model, but not so good in prompt adherence."
    elif "hunyuan_video_custom" in model_filename:
        model_name = "Hunyuan Video Custom 720p 13B"
        description = "The Hunyuan Video Custom model is proably the best model to transfer people (only people for the momment) as it is quite good to keep their identity. However it is slow as to get good results, you need to generate 720p videos with 30 steps."
        description = "The Hunyuan Video Custom model is probably the best model for transferring people (only people for the moment), as it is quite good at preserving their identity. However it is slow, since to get good results you need to generate 720p videos with 30 steps."
    elif "hunyuan_video_avatar" in model_filename:
        model_name = "Hunyuan Video Avatar 720p 13B"
        description = "With the Hunyuan Video Avatar model you can animate a person based on the content of an audio input. Please note that the video generator processes 128-frame segments at a time (even if you request fewer frames). The good news is that it will concatenate multiple segments for long video generation (at most 3 segments are recommended, as quality degrades beyond that)."
    else:
        model_name = "Wan2.1 text2video"
        model_name += " 14B" if "14B" in model_filename else " 1.3B"
@ -1758,7 +1768,14 @@ def get_default_settings(filename):
        ui_defaults.update({
            "guidance_scale": 7.5,
            "flow_shift": 13,
            "resolution": "1280x720"
            "resolution": "1280x720",
        })
    elif get_model_type(filename) in ("hunyuan_avatar"):
        ui_defaults.update({
            "guidance_scale": 7.5,
            "flow_shift": 5,
            "tea_cache_start_step_perc": 25,
            "video_length": 129,
        })
    elif get_model_type(filename) in ("vace_14B"):
        ui_defaults.update({
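For orientation, the avatar defaults above amount to roughly a five-second clip (the 25 fps value comes from the generate_video change further down):

```python
video_length = 129          # default frame count set above
fps = 25                    # Hunyuan Avatar output rate
print(video_length / fps)   # 5.16 seconds per generation at the default settings
```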
@ -1954,8 +1971,13 @@ def download_models(transformer_filename):
        text_encoder_filename = get_hunyuan_text_encoder_filename(text_encoder_quantization)
        model_def = {
            "repoId" : "DeepBeepMeep/HunyuanVideo",
            "sourceFolderList" : [ "llava-llama-3-8b", "clip_vit_large_patch14", "" ],
            "fileList" :[ ["config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "preprocessor_config.json"] + computeList(text_encoder_filename) , ["config.json", "merges.txt", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "vocab.json"], [ "hunyuan_video_720_quanto_int8_map.json", "hunyuan_video_custom_VAE_fp32.safetensors", "hunyuan_video_custom_VAE_config.json", "hunyuan_video_VAE_fp32.safetensors", "hunyuan_video_VAE_config.json" , "hunyuan_video_720_quanto_int8_map.json" ] + computeList(transformer_filename) ]
            "sourceFolderList" : [ "llava-llama-3-8b", "clip_vit_large_patch14", "whisper-tiny" , "det_align", "" ],
            "fileList" :[ ["config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "preprocessor_config.json"] + computeList(text_encoder_filename) ,
                          ["config.json", "merges.txt", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "vocab.json"],
                          ["config.json", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer_config.json"],
                          ["detface.pt"],
                          [ "hunyuan_video_720_quanto_int8_map.json", "hunyuan_video_custom_VAE_fp32.safetensors", "hunyuan_video_custom_VAE_config.json", "hunyuan_video_VAE_fp32.safetensors", "hunyuan_video_VAE_config.json" , "hunyuan_video_720_quanto_int8_map.json" ] + computeList(transformer_filename)
                        ]
        }

    else:
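If it helps to read the structure: `sourceFolderList` and `fileList` are parallel lists, one file list per folder of the `DeepBeepMeep/HunyuanVideo` repo (the empty string being the repo root), so the avatar support pulls in the whisper-tiny audio encoder and the `detface.pt` face-alignment weights. The actual download helper lives elsewhere in wgp.py; a hedged sketch of the pairing with a hypothetical `download_file`:

```python
repo_id = "DeepBeepMeep/HunyuanVideo"
source_folders = ["llava-llama-3-8b", "clip_vit_large_patch14", "whisper-tiny", "det_align", ""]
file_lists = [
    ["config.json", "tokenizer.json"],                              # truncated for the sketch
    ["config.json", "model.safetensors"],
    ["config.json", "model.safetensors", "preprocessor_config.json"],
    ["detface.pt"],
    ["hunyuan_video_VAE_fp32.safetensors"],
]

for folder, files in zip(source_folders, file_lists):
    for name in files:
        path = f"{folder}/{name}" if folder else name
        print(f"would fetch {repo_id}:{path}")   # download_file(repo_id, path) in the real code (assumed)
```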
@ -2122,6 +2144,14 @@ def load_hunyuan_model(model_filename, quantizeTransformer = False, dtype = torc

    pipe = { "transformer" : hunyuan_model.model, "text_encoder" : hunyuan_model.text_encoder, "text_encoder_2" : hunyuan_model.text_encoder_2, "vae" : hunyuan_model.vae }

    if hunyuan_model.wav2vec != None:
        pipe["wav2vec"] = hunyuan_model.wav2vec


    # if hunyuan_model.align_instance != None:
    #     pipe["align_instance"] = hunyuan_model.align_instance.facedet.model


    from hyvideo.modules.models import get_linear_split_map

    split_linear_modules_map = get_linear_split_map()
@ -2818,9 +2848,13 @@ def generate_video(
    hunyuan_t2v = "hunyuan_video_720" in model_filename
    hunyuan_i2v = "hunyuan_video_i2v" in model_filename
    hunyuan_custom = "hunyuan_video_custom" in model_filename
    hunyuan_avatar = "hunyuan_video_avatar" in model_filename
    fantasy = "fantasy" in model_filename
    if diffusion_forcing or hunyuan_t2v or hunyuan_i2v or hunyuan_custom:
        fps = 24
    elif audio_guide != None:
    elif hunyuan_avatar:
        fps = 25
    elif fantasy:
        fps = 23
    elif ltxv:
        fps = 30
@ -2829,11 +2863,13 @@ def generate_video(
    latent_size = 8 if ltxv else 4

    original_image_refs = image_refs
    if image_refs != None and len(image_refs) > 0 and (hunyuan_custom or phantom or vace):
        send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
    if image_refs != None and len(image_refs) > 0 and (hunyuan_custom or phantom or hunyuan_avatar or vace):
        if hunyuan_avatar: remove_background_images_ref = 0
        if remove_background_images_ref > 0:
            send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
        os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
        from wan.utils.utils import resize_and_remove_background
        image_refs = resize_and_remove_background(image_refs, width, height, remove_background_images_ref, fit_into_canvas= not vace)
        image_refs = resize_and_remove_background(image_refs, width, height, remove_background_images_ref, fit_into_canvas= not (vace or hunyuan_avatar) ) # no fit for vace ref images as it is done later
        update_task_thumbnails(task, locals())
        send_cmd("output")

@ -2866,13 +2902,14 @@ def generate_video(
    audio_proj_split = None
    audio_scale = None
    audio_context_lens = None
    if audio_guide != None:
    if (fantasy or hunyuan_avatar) and audio_guide != None:
        from fantasytalking.infer import parse_audio
        import librosa
        duration = librosa.get_duration(path=audio_guide)
        current_video_length = min(int(fps * duration // 4) * 4 + 5, current_video_length)
        audio_proj_split, audio_context_lens = parse_audio(audio_guide, num_frames= current_video_length, fps= fps, device= processing_device )
        audio_scale = 1.0
        current_video_length = min(int(fps * duration // 4) * 4 + 5, current_video_length)
        if fantasy:
            audio_proj_split, audio_context_lens = parse_audio(audio_guide, num_frames= current_video_length, fps= fps, device= processing_device )
            audio_scale = 1.0

    import random
    if seed == None or seed <0:
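The clamping above caps the generated length to the audio duration and rounds it onto a 4·k + 1 frame count, which is what the 4x temporal compression of the latents expects. A worked example with assumed values:

```python
fps = 25                    # Hunyuan Avatar output rate set above
duration = 6.3              # seconds of audio, e.g. from librosa.get_duration
requested = 201             # frames asked for in the UI

audio_frames = int(fps * duration // 4) * 4 + 5      # int(157.5 // 4) * 4 + 5 = 39*4 + 5 = 161
current_video_length = min(audio_frames, requested)  # 161: never longer than the audio
print(current_video_length)
```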
@ -2990,7 +3027,7 @@ def generate_video(
    if reuse_frames > 0:
        return_latent_slice = slice(-(reuse_frames - 1 + discard_last_frames ) // latent_size, None if discard_last_frames == 0 else -(discard_last_frames // latent_size) )

    if hunyuan_custom:
    if hunyuan_custom or hunyuan_avatar:
        src_ref_images = image_refs
    elif phantom:
        src_ref_images = image_refs.copy() if image_refs != None else None
@ -3098,6 +3135,7 @@ def generate_video(
        cfg_star_switch = cfg_star_switch,
        cfg_zero_step = cfg_zero_step,
        audio_cfg_scale= audio_guidance_scale,
        audio_guide=audio_guide,
        audio_proj= audio_proj_split,
        audio_scale= audio_scale,
        audio_context_lens= audio_context_lens,
@ -4502,6 +4540,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
    hunyuan_t2v = "hunyuan_video_720" in model_filename
    hunyuan_i2v = "hunyuan_video_i2v" in model_filename
    hunyuan_video_custom = "hunyuan_video_custom" in model_filename
    hunyuan_video_avatar = "hunyuan_video_avatar" in model_filename
    sliding_window_enabled = vace or diffusion_forcing or ltxv
    new_line_text = "each new line of prompt will be used for a window" if sliding_window_enabled else "each new line of prompt will generate a new video"

@ -4575,7 +4614,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
            model_mode = gr.Dropdown(value=None, visible=False)
            keep_frames_video_source = gr.Text(visible=False)

        with gr.Column(visible= vace or phantom or hunyuan_video_custom) as video_prompt_column:
        with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar) as video_prompt_column:
            video_prompt_type_value= ui_defaults.get("video_prompt_type","")
            video_prompt_type = gr.Text(value= video_prompt_type_value, visible= False)
            with gr.Row():
@ -4624,14 +4663,14 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                            ("Keep it for first Image (landscape) and remove it for other Images (objects / faces)", 2),
                        ],
                        value=ui_defaults.get("remove_background_images_ref",1),
                        label="Remove Background of Images References", scale = 3, visible= "I" in video_prompt_type_value
                        label="Remove Background of Images References", scale = 3, visible= "I" in video_prompt_type_value and not hunyuan_video_avatar
                    )

                    # remove_background_images_ref = gr.Checkbox(value=ui_defaults.get("remove_background_images_ref",1), label= "Remove Background of Images References", visible= "I" in video_prompt_type_value, scale =1 )


                video_mask = gr.Video(label= "Video Mask (for Inpainting or Outpainting, white pixels = Mask)", visible= "M" in video_prompt_type_value, value= ui_defaults.get("video_mask", None))
                audio_guide = gr.Audio(value= ui_defaults.get("audio_guide", None), type="filepath", label="Voice to follow", show_download_button= True, visible= fantasy )
                audio_guide = gr.Audio(value= ui_defaults.get("audio_guide", None), type="filepath", label="Voice to follow", show_download_button= True, visible= fantasy or hunyuan_video_avatar )

            advanced_prompt = advanced_ui
            prompt_vars=[]
@ -4720,6 +4759,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                video_length = gr.Slider(17, 737, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (16 = 1s)", interactive= True)
            elif fantasy:
                video_length = gr.Slider(5, 233, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (23 = 1s)", interactive= True)
            elif hunyuan_video_avatar:
                video_length = gr.Slider(5, 401, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (25 = 1s)", interactive= True)
            elif hunyuan_t2v or hunyuan_i2v:
                video_length = gr.Slider(5, 337, value=ui_defaults.get("video_length", 97), step=4, label="Number of frames (24 = 1s)", interactive= True)
            else:
@ -4809,7 +4850,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                    )

            with gr.Tab("Quality", visible = not ltxv) as quality_tab:
                with gr.Column(visible = not (hunyuan_i2v or hunyuan_t2v or hunyuan_video_custom) ) as skip_layer_guidance_row:
                with gr.Column(visible = not (hunyuan_i2v or hunyuan_t2v or hunyuan_video_custom or hunyuan_video_avatar) ) as skip_layer_guidance_row:
                    gr.Markdown("<B>Skip Layer Guidance (improves video quality)</B>")
                    with gr.Row():
                        slg_switch = gr.Dropdown(