Support for Hunyuan Video Avatar

DeepBeepMeep 2025-06-05 15:35:58 +02:00
parent 7670af9610
commit 1976868f6a
15 changed files with 2562 additions and 81 deletions

View File

@ -21,7 +21,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
## 🔥 Latest News!!
* May 26 2025: 👋 WanGP v5.31 : Added Phantom 14B, a model that you can use to transfer objects / people in the video. My preference goes to Vace that remains the king of controlnets.
* May 28 2025: 👋 WanGP v5.31 : Added Phantom 14B, a model that you can use to transfer objects / people in the video. My preference goes to Vace that remains the king of controlnets.
* May 26 2025: 👋 WanGP v5.3 : Happy with a Video generation and want to do more generations using the same settings, but you can't remember what you did or find it too tedious to copy / paste each setting one by one from the file metadata? Rejoice! There are now multiple ways to turn this chore into a one click task:
- Select one Video recently generated in the Video Gallery and click *Use Selected Video Settings*
- Click *Drop File Here* and select a Video you saved somewhere; if the settings metadata were saved with the Video, they will be extracted automatically

View File

@ -0,0 +1,170 @@
import os
import cv2
import math
import json
import torch
import random
import librosa
import traceback
import torchvision
import numpy as np
import pandas as pd
from PIL import Image
from einops import rearrange
from torch.utils.data import Dataset
from decord import VideoReader, cpu
from transformers import CLIPImageProcessor
import torchvision.transforms as transforms
from torchvision.transforms import ToPILImage
def get_audio_feature(feature_extractor, audio_path):
audio_input, sampling_rate = librosa.load(audio_path, sr=16000)
assert sampling_rate == 16000
audio_features = []
window = 750*640
for i in range(0, len(audio_input), window):
audio_feature = feature_extractor(audio_input[i:i+window],
sampling_rate=sampling_rate,
return_tensors="pt",
).input_features
audio_features.append(audio_feature)
audio_features = torch.cat(audio_features, dim=-1)
return audio_features, len(audio_input) // 640
class VideoAudioTextLoaderVal(Dataset):
def __init__(
self,
image_size: int,
meta_file: str,
**kwargs,
):
super().__init__()
self.meta_file = meta_file
self.image_size = image_size
self.text_encoder = kwargs.get("text_encoder", None) # llava_text_encoder
self.text_encoder_2 = kwargs.get("text_encoder_2", None) # clipL_text_encoder
self.feature_extractor = kwargs.get("feature_extractor", None)
self.meta_files = []
csv_data = pd.read_csv(meta_file)
for idx in range(len(csv_data)):
self.meta_files.append(
{
"videoid": str(csv_data["videoid"][idx]),
"image_path": str(csv_data["image"][idx]),
"audio_path": str(csv_data["audio"][idx]),
"prompt": str(csv_data["prompt"][idx]),
"fps": float(csv_data["fps"][idx])
}
)
self.llava_transform = transforms.Compose(
[
transforms.Resize((336, 336), interpolation=transforms.InterpolationMode.BILINEAR),
transforms.ToTensor(),
transforms.Normalize((0.48145466, 0.4578275, 0.4082107), (0.26862954, 0.26130258, 0.27577711)),
]
)
self.clip_image_processor = CLIPImageProcessor()
self.device = torch.device("cuda")
self.weight_dtype = torch.float16
def __len__(self):
return len(self.meta_files)
@staticmethod
def get_text_tokens(text_encoder, description, dtype_encode="video"):
text_inputs = text_encoder.text2tokens(description, data_type=dtype_encode)
text_ids = text_inputs["input_ids"].squeeze(0)
text_mask = text_inputs["attention_mask"].squeeze(0)
return text_ids, text_mask
def get_batch_data(self, idx):
meta_file = self.meta_files[idx]
videoid = meta_file["videoid"]
image_path = meta_file["image_path"]
audio_path = meta_file["audio_path"]
prompt = "Authentic, Realistic, Natural, High-quality, Lens-Fixed, " + meta_file["prompt"]
fps = meta_file["fps"]
img_size = self.image_size
ref_image = Image.open(image_path).convert('RGB')
# Resize reference image
w, h = ref_image.size
scale = img_size / min(w, h)
new_w = round(w * scale / 64) * 64
new_h = round(h * scale / 64) * 64
if img_size == 704:
img_size_long = 1216
if new_w * new_h > img_size * img_size_long:
import math
scale = math.sqrt(img_size * img_size_long / w / h)
new_w = round(w * scale / 64) * 64
new_h = round(h * scale / 64) * 64
ref_image = ref_image.resize((new_w, new_h), Image.LANCZOS)
ref_image = np.array(ref_image)
ref_image = torch.from_numpy(ref_image)
audio_input, audio_len = get_audio_feature(self.feature_extractor, audio_path)
audio_prompts = audio_input[0]
motion_bucket_id_heads = np.array([25] * 4)
motion_bucket_id_exps = np.array([30] * 4)
motion_bucket_id_heads = torch.from_numpy(motion_bucket_id_heads)
motion_bucket_id_exps = torch.from_numpy(motion_bucket_id_exps)
fps = torch.from_numpy(np.array(fps))
to_pil = ToPILImage()
pixel_value_ref = rearrange(ref_image.clone().unsqueeze(0), "b h w c -> b c h w") # (b c h w)
pixel_value_ref_llava = [self.llava_transform(to_pil(image)) for image in pixel_value_ref]
pixel_value_ref_llava = torch.stack(pixel_value_ref_llava, dim=0)
pixel_value_ref_clip = self.clip_image_processor(
images=Image.fromarray((pixel_value_ref[0].permute(1,2,0)).data.cpu().numpy().astype(np.uint8)),
return_tensors="pt"
).pixel_values[0]
pixel_value_ref_clip = pixel_value_ref_clip.unsqueeze(0)
# Encode text prompts
text_ids, text_mask = self.get_text_tokens(self.text_encoder, prompt)
text_ids_2, text_mask_2 = self.get_text_tokens(self.text_encoder_2, prompt)
# Output batch
batch = {
"text_prompt": prompt, #
"videoid": videoid,
"pixel_value_ref": pixel_value_ref.to(dtype=torch.float16), # 参考图用于vae提特征 (1, 3, h, w), 取值范围(0, 255)
"pixel_value_ref_llava": pixel_value_ref_llava.to(dtype=torch.float16), # 参考图用于llava提特征 (1, 3, 336, 336), 取值范围 = CLIP取值范围
"pixel_value_ref_clip": pixel_value_ref_clip.to(dtype=torch.float16), # 参考图用于clip_image_encoder提特征 (1, 3, 244, 244), 取值范围 = CLIP取值范围
"audio_prompts": audio_prompts.to(dtype=torch.float16),
"motion_bucket_id_heads": motion_bucket_id_heads.to(dtype=text_ids.dtype),
"motion_bucket_id_exps": motion_bucket_id_exps.to(dtype=text_ids.dtype),
"fps": fps.to(dtype=torch.float16),
"text_ids": text_ids.clone(), # 对应llava_text_encoder
"text_mask": text_mask.clone(), # 对应llava_text_encoder
"text_ids_2": text_ids_2.clone(), # 对应clip_text_encoder
"text_mask_2": text_mask_2.clone(), # 对应clip_text_encoder
"audio_len": audio_len,
"image_path": image_path,
"audio_path": audio_path,
}
return batch
def __getitem__(self, idx):
return self.get_batch_data(idx)
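
For reference, get_audio_feature above chunks the 16 kHz waveform into 30-second Whisper windows (750 * 640 samples) and also returns how many 25 fps video frames the audio spans (640 samples per frame at 16 kHz). A minimal sketch of that bookkeeping, using the Hub id "openai/whisper-tiny" as a stand-in for the local ckpts/whisper-tiny checkpoint used by this commit:

import numpy as np
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")  # stand-in for ckpts/whisper-tiny
sr = 16000                                   # librosa.load(..., sr=16000)
window = 750 * 640                           # 480000 samples = 30 s, Whisper's native window
audio = np.zeros(10 * sr, dtype=np.float32)  # 10 s of dummy audio

feats = feature_extractor(audio[:window], sampling_rate=sr, return_tensors="pt").input_features
print(feats.shape)                           # (1, 80, 3000) log-mel features for whisper-tiny
print(len(audio) // 640)                     # 250: number of 25 fps frames covered by 10 s of audio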

View File

@ -0,0 +1,72 @@
import os
import cv2
import json
import time
import decord
import einops
import librosa
import torch
import random
import argparse
import traceback
import numpy as np
from tqdm import tqdm
from PIL import Image
from einops import rearrange
def get_facemask(ref_image, align_instance, area=1.25):
# ref_image: (b f c h w)
bsz, f, c, h, w = ref_image.shape
images = rearrange(ref_image, "b f c h w -> (b f) h w c").data.cpu().numpy().astype(np.uint8)
face_masks = []
for image in images:
image_pil = Image.fromarray(image).convert("RGB")
_, _, bboxes_list = align_instance(np.array(image_pil)[:,:,[2,1,0]], maxface=True)
try:
bboxSrc = bboxes_list[0]
except Exception:  # no face detected, fall back to the full frame
bboxSrc = [0, 0, w, h]
x1, y1, ww, hh = bboxSrc
x2, y2 = x1 + ww, y1 + hh
ww, hh = (x2-x1) * area, (y2-y1) * area
center = [(x2+x1)//2, (y2+y1)//2]
x1 = max(center[0] - ww//2, 0)
y1 = max(center[1] - hh//2, 0)
x2 = min(center[0] + ww//2, w)
y2 = min(center[1] + hh//2, h)
face_mask = np.zeros_like(np.array(image_pil))
face_mask[int(y1):int(y2), int(x1):int(x2)] = 1.0
face_masks.append(torch.from_numpy(face_mask[...,:1]))
face_masks = torch.stack(face_masks, dim=0) # (b*f, h, w, c)
face_masks = rearrange(face_masks, "(b f) h w c -> b c f h w", b=bsz, f=f)
face_masks = face_masks.to(device=ref_image.device, dtype=ref_image.dtype)
return face_masks
def encode_audio(wav2vec, audio_feats, fps, num_frames=129):
if fps == 25:
start_ts = [0]
step_ts = [1]
elif fps == 12.5:
start_ts = [0]
step_ts = [2]
num_frames = min(num_frames, 400)
audio_feats = wav2vec.encoder(audio_feats.unsqueeze(0)[:, :, :3000], output_hidden_states=True).hidden_states
audio_feats = torch.stack(audio_feats, dim=2)
audio_feats = torch.cat([torch.zeros_like(audio_feats[:,:4]), audio_feats], 1)
audio_prompts = []
for bb in range(1):
audio_feats_list = []
for f in range(num_frames):
cur_t = (start_ts[bb] + f * step_ts[bb]) * 2
audio_clip = audio_feats[bb:bb+1, cur_t: cur_t+10]
audio_feats_list.append(audio_clip)
audio_feats_list = torch.stack(audio_feats_list, 1)
audio_prompts.append(audio_feats_list)
audio_prompts = torch.cat(audio_prompts)
return audio_prompts
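
encode_audio assumes an fps of 25 or 12.5 and, for every video frame, slices a 10-step window out of the stacked Whisper encoder hidden states (whisper-tiny: 4 layers + embeddings = 5 states of width 384, 1500 steps per 30 s window). A shape sketch, again with "openai/whisper-tiny" standing in for the local checkpoint:

import torch
from transformers import WhisperModel

wav2vec = WhisperModel.from_pretrained("openai/whisper-tiny").eval()
log_mel = torch.zeros(1, 80, 3000)                    # one 30 s window of log-mel features
with torch.no_grad():
    hidden = wav2vec.encoder(log_mel, output_hidden_states=True).hidden_states
feats = torch.stack(hidden, dim=2)                    # (1, 1500, 5, 384)
feats = torch.cat([torch.zeros_like(feats[:, :4]), feats], dim=1)   # left-pad 4 steps, as above

num_frames = 129
clips = [feats[:, 2 * f: 2 * f + 10] for f in range(num_frames)]    # 25 fps: cur_t = 2 * f
audio_prompts = torch.stack(clips, dim=1)
print(audio_prompts.shape)   # (1, 129, 10, 5, 384): the input expected by AudioProjNet2(seq_len=10, blocks=5, channels=384)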

View File

@ -0,0 +1,41 @@
import os
import cv2
import torch
import numpy as np
import imageio
import torchvision
from einops import rearrange
def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8, quality=8):
videos = rearrange(videos, "b c t h w -> t b c h w")
outputs = []
for x in videos:
x = torchvision.utils.make_grid(x, nrow=n_rows)
x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
if rescale:
x = (x + 1.0) / 2.0 # -1,1 -> 0,1
x = torch.clamp(x,0,1)
x = (x * 255).numpy().astype(np.uint8)
outputs.append(x)
os.makedirs(os.path.dirname(path), exist_ok=True)
imageio.mimsave(path, outputs, fps=fps, quality=quality)
def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
crop_h, crop_w = crop_img.shape[:2]
target_w, target_h = size
scale_h, scale_w = target_h / crop_h, target_w / crop_w
if scale_w > scale_h:
resize_h = int(target_h*resize_ratio)
resize_w = int(crop_w / crop_h * resize_h)
else:
resize_w = int(target_w*resize_ratio)
resize_h = int(crop_h / crop_w * resize_w)
crop_img = cv2.resize(crop_img, (resize_w, resize_h))
pad_left = (target_w - resize_w) // 2
pad_top = (target_h - resize_h) // 2
pad_right = target_w - resize_w - pad_left
pad_bottom = target_h - resize_h - pad_top
crop_img = cv2.copyMakeBorder(crop_img, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=color)
return crop_img
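
pad_image letterboxes an image to a target (width, height): it resizes along the tighter dimension, keeps the aspect ratio, and centers the result with constant-color borders. A quick sketch, assuming the function above is in scope:

import numpy as np

img = np.zeros((480, 640, 3), dtype=np.uint8)   # h=480, w=640 dummy frame
out = pad_image(img, size=(336, 336))           # target (w, h)
print(out.shape)                                # (336, 336, 3): resized to 336x252, then padded
                                                # with 42 white rows on top and bottom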

View File

@ -0,0 +1 @@
from .align import AlignImage

View File

@ -0,0 +1,34 @@
import os
import sys
import torch
from .detface import DetFace
class AlignImage(object):
def __init__(self, device='cuda', det_path=''):
self.facedet = DetFace(pt_path=det_path, confThreshold=0.5, nmsThreshold=0.45, device=device)
@torch.no_grad()
def __call__(self, im, maxface=False):
bboxes, kpss, scores = self.facedet.detect(im)
face_num = bboxes.shape[0]
five_pts_list = []
scores_list = []
bboxes_list = []
for i in range(face_num):
five_pts_list.append(kpss[i].reshape(5,2))
scores_list.append(scores[i])
bboxes_list.append(bboxes[i])
if maxface and face_num>1:
max_idx = 0
max_area = (bboxes[0, 2])*(bboxes[0, 3])
for i in range(1, face_num):
area = (bboxes[i,2])*(bboxes[i,3])
if area>max_area:
max_idx = i
five_pts_list = [five_pts_list[max_idx]]
scores_list = [scores_list[max_idx]]
bboxes_list = [bboxes_list[max_idx]]
return five_pts_list, scores_list, bboxes_list

View File

@ -0,0 +1,283 @@
# -*- coding: UTF-8 -*-
import os
import cv2
import numpy as np
import torch
import torchvision
def xyxy2xywh(x):
# Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[:, 0] = (x[:, 0] + x[:, 2]) / 2 # x center
y[:, 1] = (x[:, 1] + x[:, 3]) / 2 # y center
y[:, 2] = x[:, 2] - x[:, 0] # width
y[:, 3] = x[:, 3] - x[:, 1] # height
return y
def xywh2xyxy(x):
# Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
return y
def box_iou(box1, box2):
# https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
"""
Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
Arguments:
box1 (Tensor[N, 4])
box2 (Tensor[M, 4])
Returns:
iou (Tensor[N, M]): the NxM matrix containing the pairwise
IoU values for every element in boxes1 and boxes2
"""
def box_area(box):
# box = 4xn
return (box[2] - box[0]) * (box[3] - box[1])
area1 = box_area(box1.T)
area2 = box_area(box2.T)
# inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) -
torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
# iou = inter / (area1 + area2 - inter)
return inter / (area1[:, None] + area2 - inter)
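
# Worked example (sketch, assuming box_iou above is in scope): two 2x2 boxes
# shifted horizontally by 1 overlap on a 1x2 strip, so IoU = 2 / (4 + 4 - 2) = 1/3.
#   a = torch.tensor([[0., 0., 2., 2.]]); b = torch.tensor([[1., 0., 3., 2.]])
#   box_iou(a, b)  ->  tensor([[0.3333]])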
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
# Rescale coords (xyxy) from img1_shape to img0_shape
if ratio_pad is None: # calculate from img0_shape
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding
else:
gain = ratio_pad[0][0]
pad = ratio_pad[1]
coords[:, [0, 2]] -= pad[0] # x padding
coords[:, [1, 3]] -= pad[1] # y padding
coords[:, :4] /= gain
clip_coords(coords, img0_shape)
return coords
def clip_coords(boxes, img_shape):
# Clip bounding xyxy bounding boxes to image shape (height, width)
boxes[:, 0].clamp_(0, img_shape[1]) # x1
boxes[:, 1].clamp_(0, img_shape[0]) # y1
boxes[:, 2].clamp_(0, img_shape[1]) # x2
boxes[:, 3].clamp_(0, img_shape[0]) # y2
def scale_coords_landmarks(img1_shape, coords, img0_shape, ratio_pad=None):
# Rescale coords (xyxy) from img1_shape to img0_shape
if ratio_pad is None: # calculate from img0_shape
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding
else:
gain = ratio_pad[0][0]
pad = ratio_pad[1]
coords[:, [0, 2, 4, 6, 8]] -= pad[0] # x padding
coords[:, [1, 3, 5, 7, 9]] -= pad[1] # y padding
coords[:, :10] /= gain
#clip_coords(coords, img0_shape)
coords[:, 0].clamp_(0, img0_shape[1]) # x1
coords[:, 1].clamp_(0, img0_shape[0]) # y1
coords[:, 2].clamp_(0, img0_shape[1]) # x2
coords[:, 3].clamp_(0, img0_shape[0]) # y2
coords[:, 4].clamp_(0, img0_shape[1]) # x3
coords[:, 5].clamp_(0, img0_shape[0]) # y3
coords[:, 6].clamp_(0, img0_shape[1]) # x4
coords[:, 7].clamp_(0, img0_shape[0]) # y4
coords[:, 8].clamp_(0, img0_shape[1]) # x5
coords[:, 9].clamp_(0, img0_shape[0]) # y5
return coords
def show_results(img, xywh, conf, landmarks, class_num):
h,w,c = img.shape
tl = 1 or round(0.002 * (h + w) / 2) + 1 # line/font thickness
x1 = int(xywh[0] * w - 0.5 * xywh[2] * w)
y1 = int(xywh[1] * h - 0.5 * xywh[3] * h)
x2 = int(xywh[0] * w + 0.5 * xywh[2] * w)
y2 = int(xywh[1] * h + 0.5 * xywh[3] * h)
cv2.rectangle(img, (x1,y1), (x2, y2), (0,255,0), thickness=tl, lineType=cv2.LINE_AA)
clors = [(255,0,0),(0,255,0),(0,0,255),(255,255,0),(0,255,255)]
for i in range(5):
point_x = int(landmarks[2 * i] * w)
point_y = int(landmarks[2 * i + 1] * h)
cv2.circle(img, (point_x, point_y), tl+1, clors[i], -1)
tf = max(tl - 1, 1) # font thickness
label = str(conf)[:5]
cv2.putText(img, label, (x1, y1 - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
return img
def make_divisible(x, divisor):
# Returns x evenly divisible by divisor
return (x // divisor) * divisor
def non_max_suppression_face(prediction, conf_thres=0.5, iou_thres=0.45, classes=None, agnostic=False, labels=()):
"""Performs Non-Maximum Suppression (NMS) on inference results
Returns:
detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
"""
nc = prediction.shape[2] - 15 # number of classes
xc = prediction[..., 4] > conf_thres # candidates
# Settings
min_wh, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height
# time_limit = 10.0 # seconds to quit after
redundant = True # require redundant detections
multi_label = nc > 1 # multiple labels per box (adds 0.5ms/img)
merge = False # use merge-NMS
# t = time.time()
output = [torch.zeros((0, 16), device=prediction.device)] * prediction.shape[0]
for xi, x in enumerate(prediction): # image index, image inference
# Apply constraints
# x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
x = x[xc[xi]] # confidence
# Cat apriori labels if autolabelling
if labels and len(labels[xi]):
l = labels[xi]
v = torch.zeros((len(l), nc + 15), device=x.device)
v[:, :4] = l[:, 1:5] # box
v[:, 4] = 1.0 # conf
v[range(len(l)), l[:, 0].long() + 15] = 1.0 # cls
x = torch.cat((x, v), 0)
# If none remain process next image
if not x.shape[0]:
continue
# Compute conf
x[:, 15:] *= x[:, 4:5] # conf = obj_conf * cls_conf
# Box (center x, center y, width, height) to (x1, y1, x2, y2)
box = xywh2xyxy(x[:, :4])
# Detections matrix nx6 (xyxy, conf, landmarks, cls)
if multi_label:
i, j = (x[:, 15:] > conf_thres).nonzero(as_tuple=False).T
x = torch.cat((box[i], x[i, j + 15, None], x[i, 5:15] ,j[:, None].float()), 1)
else: # best class only
conf, j = x[:, 15:].max(1, keepdim=True)
x = torch.cat((box, conf, x[:, 5:15], j.float()), 1)[conf.view(-1) > conf_thres]
# Filter by class
if classes is not None:
x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
# If none remain process next image
n = x.shape[0] # number of boxes
if not n:
continue
# Batched NMS
c = x[:, 15:16] * (0 if agnostic else max_wh) # classes
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
#if i.shape[0] > max_det: # limit detections
# i = i[:max_det]
if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
# update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
weights = iou * scores[None] # box weights
x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
if redundant:
i = i[iou.sum(1) > 1] # require redundancy
output[xi] = x[i]
# if (time.time() - t) > time_limit:
# break # time limit exceeded
return output
class DetFace():
def __init__(self, pt_path, confThreshold=0.5, nmsThreshold=0.45, device='cuda'):
assert os.path.exists(pt_path)
self.inpSize = 416
self.conf_thres = confThreshold
self.iou_thres = nmsThreshold
self.test_device = torch.device(device if torch.cuda.is_available() else "cpu")
self.model = torch.jit.load(pt_path).to(self.test_device)
self.last_w = 416
self.last_h = 416
self.grids = None
@torch.no_grad()
def detect(self, srcimg):
# t0=time.time()
h0, w0 = srcimg.shape[:2] # orig hw
r = self.inpSize / min(h0, w0) # resize image to img_size
h1 = int(h0*r+31)//32*32
w1 = int(w0*r+31)//32*32
img = cv2.resize(srcimg, (w1,h1), interpolation=cv2.INTER_LINEAR)
# Convert
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # BGR to RGB
# Run inference
img = torch.from_numpy(img).to(self.test_device).permute(2,0,1)
img = img.float()/255 # uint8 to fp16/32 0-1
if img.ndimension() == 3:
img = img.unsqueeze(0)
# Inference
if h1 != self.last_h or w1 != self.last_w or self.grids is None:
grids = []
for scale in [8,16,32]:
ny = h1//scale
nx = w1//scale
yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
grid = torch.stack((xv, yv), 2).view((1,1,ny, nx, 2)).float()
grids.append(grid.to(self.test_device))
self.grids = grids
self.last_w = w1
self.last_h = h1
pred = self.model(img, self.grids).cpu()
# Apply NMS
det = non_max_suppression_face(pred, self.conf_thres, self.iou_thres)[0]
# Process detections
# det = pred[0]
bboxes = np.zeros((det.shape[0], 4))
kpss = np.zeros((det.shape[0], 5, 2))
scores = np.zeros((det.shape[0]))
# gn = torch.tensor([w0, h0, w0, h0]).to(pred) # normalization gain whwh
# gn_lks = torch.tensor([w0, h0, w0, h0, w0, h0, w0, h0, w0, h0]).to(pred) # normalization gain landmarks
det = det.cpu().numpy()
for j in range(det.shape[0]):
# xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(4).cpu().numpy()
bboxes[j, 0] = det[j, 0] * w0/w1
bboxes[j, 1] = det[j, 1] * h0/h1
bboxes[j, 2] = det[j, 2] * w0/w1 - bboxes[j, 0]
bboxes[j, 3] = det[j, 3] * h0/h1 - bboxes[j, 1]
scores[j] = det[j, 4]
# landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(5,2).cpu().numpy()
kpss[j, :, :] = det[j, 5:15].reshape(5, 2) * np.array([[w0/w1,h0/h1]])
# class_num = det[j, 15].cpu().numpy()
# orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
return bboxes, kpss, scores

View File

@ -1 +1,2 @@
from .pipeline_hunyuan_video import HunyuanVideoPipeline
from .pipeline_hunyuan_video_audio import HunyuanVideoAudioPipeline

View File

@ -1142,7 +1142,7 @@ class HunyuanVideoPipeline(DiffusionPipeline):
target_dtype = PRECISION_TO_TYPE[precision]
autocast_enabled = target_dtype != torch.float32 and not disable_autocast
vae_dtype = PRECISION_TO_TYPE[vae_precision]
vae_dtype = self.vae._model_dtype # PRECISION_TO_TYPE[vae_precision]
vae_autocast_enabled = vae_dtype != torch.float32 and not disable_autocast
# 7. Denoising loop
@ -1262,6 +1262,7 @@ class HunyuanVideoPipeline(DiffusionPipeline):
guidance=guidance_expand,
pipeline=self,
x_id=j,
step_no=i,
callback = callback,
)
if self._interrupt:
@ -1290,6 +1291,7 @@ class HunyuanVideoPipeline(DiffusionPipeline):
freqs_sin=freqs_cis[1], # [seqlen, head_dim]
guidance=guidance_expand,
pipeline=self,
step_no=i,
callback = callback,
)
if self._interrupt:
@ -1404,7 +1406,6 @@ class HunyuanVideoPipeline(DiffusionPipeline):
else:
image = latents
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().float()

File diff suppressed because it is too large

View File

@ -5,7 +5,7 @@ import functools
from typing import List, Optional, Tuple, Union
from pathlib import Path
from einops import rearrange
import torch
import torch.distributed as dist
from hyvideo.constants import PROMPT_TEMPLATE, NEGATIVE_PROMPT, PRECISION_TO_TYPE, NEGATIVE_PROMPT_I2V
@ -16,11 +16,34 @@ from hyvideo.utils.data_utils import align_to, get_closest_ratio, generate_crop_
from hyvideo.modules.posemb_layers import get_nd_rotary_pos_embed, get_nd_rotary_pos_embed_new
from hyvideo.diffusion.schedulers import FlowMatchDiscreteScheduler
from hyvideo.diffusion.pipelines import HunyuanVideoPipeline
from hyvideo.diffusion.pipelines import HunyuanVideoAudioPipeline
from PIL import Image
import numpy as np
import torchvision.transforms as transforms
import cv2
from wan.utils.utils import resize_lanczos, calculate_new_dimensions
from hyvideo.data_kits.audio_preprocessor import encode_audio, get_facemask
from transformers import WhisperModel
from transformers import AutoFeatureExtractor
from hyvideo.data_kits.face_align import AlignImage
import librosa
def get_audio_feature(feature_extractor, audio_path, duration):
audio_input, sampling_rate = librosa.load(audio_path, duration=duration, sr=16000)
assert sampling_rate == 16000
audio_features = []
window = 750*640
for i in range(0, len(audio_input), window):
audio_feature = feature_extractor(audio_input[i:i+window],
sampling_rate=sampling_rate,
return_tensors="pt",
device="cuda"
).input_features
audio_features.append(audio_feature)
audio_features = torch.cat(audio_features, dim=-1)
return audio_features, len(audio_input) // 640
def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
crop_h, crop_w = crop_img.shape[:2]
@ -212,6 +235,14 @@ def patched_llava_forward(
image_hidden_states=image_features if pixel_values is not None else None,
)
def adapt_avatar_model(model):
modules_dict= { k: m for k, m in model.named_modules()}
for model_layer, avatar_layer in model.double_stream_map.items():
module = modules_dict[f"audio_adapter_blocks.{avatar_layer}"]
target = modules_dict[f"double_blocks.{model_layer}"]
setattr(target, "audio_adapter", module )
delattr(model, "audio_adapter_blocks")
class DataPreprocess(object):
def __init__(self):
self.llava_size = (336, 336)
@ -223,12 +254,18 @@ class DataPreprocess(object):
]
)
def get_batch(self, image , size):
def get_batch(self, image , size, pad = False):
image = np.asarray(image)
llava_item_image = pad_image(image.copy(), self.llava_size)
if pad:
llava_item_image = pad_image(image.copy(), self.llava_size)
else:
llava_item_image = image.copy()
uncond_llava_item_image = np.ones_like(llava_item_image) * 255
cat_item_image = pad_image(image.copy(), size)
if pad:
cat_item_image = pad_image(image.copy(), size)
else:
cat_item_image = image.copy()
llava_item_tensor = self.llava_transform(Image.fromarray(llava_item_image.astype(np.uint8)))
uncond_llava_item_tensor = self.llava_transform(Image.fromarray(uncond_llava_item_image))
cat_item_tensor = torch.from_numpy(cat_item_image.copy()).permute((2, 0, 1)) / 255.0
@ -243,6 +280,8 @@ class Inference(object):
def __init__(
self,
i2v,
custom,
avatar,
enable_cfg,
vae,
vae_kwargs,
@ -250,9 +289,14 @@ class Inference(object):
model,
text_encoder_2=None,
pipeline=None,
feature_extractor=None,
wav2vec=None,
align_instance=None,
device=None,
):
self.i2v = i2v
self.custom = custom
self.avatar = avatar
self.enable_cfg = enable_cfg
self.vae = vae
self.vae_kwargs = vae_kwargs
@ -263,8 +307,11 @@ class Inference(object):
self.model = model
self.pipeline = pipeline
self.device = "cuda"
self.feature_extractor=feature_extractor
self.wav2vec=wav2vec
self.align_instance=align_instance
self.device = "cuda"
@classmethod
@ -285,15 +332,21 @@ class Inference(object):
i2v_condition_type = None
i2v_mode = "i2v" in model_filepath[0]
custom = False
avatar = False
if i2v_mode:
model_id = "HYVideo-T/2"
i2v_condition_type = "token_replace"
elif "custom" in model_filepath[0]:
model_id = "HYVideo-T/2-custom"
custom = True
elif "avatar" in model_filepath[0]:
model_id = "HYVideo-T/2-avatar"
text_len = 256
avatar = True
else:
model_id = "HYVideo-T/2-cfgdistill"
if i2v_mode and i2v_condition_type == "latent_concat":
in_channels = latent_channels * 2 + 1
image_embed_interleave = 2
@ -323,11 +376,11 @@ class Inference(object):
from mmgp import offload
# model = Inference.load_state_dict(args, model, model_filepath)
# model_filepath ="c:/temp/hc/mp_rank_00_model_states.pt"
# model_filepath ="c:/temp/avatar/mp_rank_00_model_states.pt"
offload.load_model_data(model, model_filepath, pinToMemory = pinToMemory, partialPinning = partialPinning)
pass
# offload.save_model(model, "hunyuan_video_custom_720_bf16.safetensors")
# offload.save_model(model, "hunyuan_video_custom_720_quanto_bf16_int8.safetensors", do_quantize= True)
# offload.save_model(model, "hunyuan_video_avatar_720_bf16.safetensors")
# offload.save_model(model, "hunyuan_video_avatar_720_quanto_bf16_int8.safetensors", do_quantize= True)
model.mixed_precision = mixed_precision_transformer
@ -338,9 +391,12 @@ class Inference(object):
# ============================= Build extra models ========================
# VAE
if custom:
if custom or avatar:
vae_configpath = "ckpts/hunyuan_video_custom_VAE_config.json"
vae_filepath = "ckpts/hunyuan_video_custom_VAE_fp32.safetensors"
# elif avatar:
# vae_configpath = "ckpts/config_vae_avatar.json"
# vae_filepath = "ckpts/vae_avatar.pt"
else:
vae_configpath = "ckpts/hunyuan_video_VAE_config.json"
vae_filepath = "ckpts/hunyuan_video_VAE_fp32.safetensors"
@ -350,6 +406,7 @@ class Inference(object):
vae, _, s_ratio, t_ratio = load_vae( "884-16c-hy", vae_path= vae_filepath, vae_config_path= vae_configpath, vae_precision= vae_precision, device= "cpu", )
vae._model_dtype = torch.float32 if VAE_dtype == torch.float32 else (torch.float16 if avatar else torch.bfloat16)
vae._model_dtype = torch.float32 if VAE_dtype == torch.float32 else torch.bfloat16
vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}
enable_cfg = False
@ -359,7 +416,7 @@ class Inference(object):
tokenizer = "llm-i2v"
prompt_template = "dit-llm-encode-i2v"
prompt_template_video = "dit-llm-encode-video-i2v"
elif custom :
elif custom or avatar :
text_encoder = "llm-i2v"
tokenizer = "llm-i2v"
prompt_template = "dit-llm-encode"
@ -411,14 +468,33 @@ class Inference(object):
device="cpu",
)
feature_extractor = None
wav2vec = None
align_instance = None
if avatar:
feature_extractor = AutoFeatureExtractor.from_pretrained("ckpts/whisper-tiny/")
wav2vec = WhisperModel.from_pretrained("ckpts/whisper-tiny/").to(device="cpu", dtype=torch.float32)
wav2vec._model_dtype = torch.float32
wav2vec.requires_grad_(False)
align_instance = AlignImage("cuda", det_path="ckpts/det_align/detface.pt")
align_instance.facedet.model.to("cpu")
adapt_avatar_model(model)
return cls(
i2v=i2v_mode,
custom=custom,
avatar=avatar,
enable_cfg = enable_cfg,
vae=vae,
vae_kwargs=vae_kwargs,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
model=model,
feature_extractor=feature_extractor,
wav2vec=wav2vec,
align_instance=align_instance,
device=device,
)
@ -428,6 +504,8 @@ class HunyuanVideoSampler(Inference):
def __init__(
self,
i2v,
custom,
avatar,
enable_cfg,
vae,
vae_kwargs,
@ -435,10 +513,15 @@ class HunyuanVideoSampler(Inference):
model,
text_encoder_2=None,
pipeline=None,
feature_extractor=None,
wav2vec=None,
align_instance=None,
device=0,
):
super().__init__(
i2v,
custom,
avatar,
enable_cfg,
vae,
vae_kwargs,
@ -446,12 +529,16 @@ class HunyuanVideoSampler(Inference):
model,
text_encoder_2=text_encoder_2,
pipeline=pipeline,
feature_extractor=feature_extractor,
wav2vec=wav2vec,
align_instance=align_instance,
device=device,
)
self.i2v_mode = i2v
self.enable_cfg = enable_cfg
self.pipeline = self.load_diffusion_pipeline(
avatar = self.avatar,
vae=self.vae,
text_encoder=self.text_encoder,
text_encoder_2=self.text_encoder_2,
@ -474,6 +561,7 @@ class HunyuanVideoSampler(Inference):
def load_diffusion_pipeline(
self,
avatar,
vae,
text_encoder,
text_encoder_2,
@ -491,14 +579,24 @@ class HunyuanVideoSampler(Inference):
solver="euler",
)
pipeline = HunyuanVideoPipeline(
vae=vae,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
transformer=model,
scheduler=scheduler,
progress_bar_config=progress_bar_config,
)
if avatar:
pipeline = HunyuanVideoAudioPipeline(
vae=vae,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
transformer=model,
scheduler=scheduler,
progress_bar_config=progress_bar_config,
)
else:
pipeline = HunyuanVideoPipeline(
vae=vae,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
transformer=model,
scheduler=scheduler,
progress_bar_config=progress_bar_config,
)
return pipeline
@ -588,6 +686,8 @@ class HunyuanVideoSampler(Inference):
self,
input_prompt,
input_ref_images = None,
audio_guide = None,
fps = 24,
height=192,
width=336,
frame_num=129,
@ -617,14 +717,12 @@ class HunyuanVideoSampler(Inference):
self.vae.tile_sample_min_size = VAE_tile_size["tile_sample_min_size"]
self.vae.tile_latent_min_size = VAE_tile_size["tile_latent_min_size"]
self.vae.tile_overlap_factor = VAE_tile_size["tile_overlap_factor"]
self.vae.enable_tiling()
i2v_mode= self.i2v_mode
if not self.enable_cfg:
guide_scale=1.0
out_dict = dict()
# ========================================================================
# Arguments: seed
# ========================================================================
@ -663,7 +761,6 @@ class HunyuanVideoSampler(Inference):
seed_everything(seed)
generator = [torch.Generator("cuda").manual_seed(seed) for seed in seeds]
# generator = [torch.Generator(self.device).manual_seed(seed) for seed in seeds]
out_dict["seeds"] = seeds
# ========================================================================
# Arguments: target_width, target_height, target_frame_num
@ -681,8 +778,6 @@ class HunyuanVideoSampler(Inference):
target_width = align_to(width, 16)
target_frame_num = frame_num
out_dict["size"] = (target_height, target_width, target_frame_num)
if input_ref_images != None:
# ip_cfg_scale = 3.0
ip_cfg_scale = 0
@ -769,28 +864,91 @@ class HunyuanVideoSampler(Inference):
if input_ref_images == None:
freqs_cos, freqs_sin = self.get_rotary_pos_embed(target_frame_num, target_height, target_width, enable_riflex)
else:
concat_dict = {'mode': 'timecat-w', 'bias': -1}
freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(target_frame_num, target_height, target_width, concat_dict)
if self.avatar:
w, h = input_ref_images.size
target_height, target_width = calculate_new_dimensions(target_height, target_width, h, w, fit_into_canvas)
concat_dict = {'mode': 'timecat', 'bias': -1}
freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(129, target_height, target_width, concat_dict)
else:
concat_dict = {'mode': 'timecat-w', 'bias': -1}
freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(target_frame_num, target_height, target_width, concat_dict)
n_tokens = freqs_cos.shape[0]
callback = kwargs.pop("callback", None)
callback_steps = kwargs.pop("callback_steps", None)
# ========================================================================
# Pipeline inference
# ========================================================================
start_time = time.time()
# "pixel_value_llava": llava_item_tensor.unsqueeze(0),
# "uncond_pixel_value_llava": uncond_llava_item_tensor.unsqueeze(0),
# 'pixel_value_ref': cat_item_tensor.unsqueeze(0),
pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = None, None, None
if input_ref_images == None:
pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = None, None, None
name = None
else:
pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = DataPreprocess().get_batch(input_ref_images, (target_width, target_height))
pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = DataPreprocess().get_batch(input_ref_images, (target_width, target_height), pad = self.custom)
ref_latents, uncond_audio_prompts, audio_prompts, face_masks, motion_exp, motion_pose = None, None, None, None, None, None
if audio_guide != None:
if n_prompt == None or len(n_prompt) == 0:
n_prompt = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, Lens changes"
uncond_pixel_value_llava = pixel_value_llava.clone()
pixel_value_ref = pixel_value_ref.unsqueeze(0)
self.align_instance.facedet.model.to("cuda")
face_masks = get_facemask(pixel_value_ref.to("cuda")*255, self.align_instance, area=3.0)
# iii = (face_masks.squeeze(0).squeeze(0).permute(1,2,0).repeat(1,1,3)*255).cpu().numpy().astype(np.uint8)
# image = Image.fromarray(iii)
# image.save("mask.png")
# jjj = (pixel_value_ref.squeeze(0).squeeze(0).permute(1,2,0)*255).cpu().numpy().astype(np.uint8)
self.align_instance.facedet.model.to("cpu")
# pixel_value_ref = pixel_value_ref.clone().repeat(1,129,1,1,1)
pixel_value_ref = pixel_value_ref.repeat(1,1+4*2,1,1,1)
pixel_value_ref = pixel_value_ref * 2 - 1
pixel_value_ref_for_vae = rearrange(pixel_value_ref, "b f c h w -> b c f h w")
vae_dtype = self.vae.dtype
with torch.autocast(device_type="cuda", dtype=vae_dtype, enabled=vae_dtype != torch.float32):
ref_latents = self.vae.encode(pixel_value_ref_for_vae).latent_dist.sample()
ref_latents = torch.cat( [ref_latents[:,:, :1], ref_latents[:,:, 1:2].repeat(1,1,31,1,1), ref_latents[:,:, -1:]], dim=2)
pixel_value_ref, pixel_value_ref_for_vae = None, None
if hasattr(self.vae.config, 'shift_factor') and self.vae.config.shift_factor:
ref_latents.sub_(self.vae.config.shift_factor).mul_(self.vae.config.scaling_factor)
else:
ref_latents.mul_(self.vae.config.scaling_factor)
# out_latents= ref_latents / self.vae.config.scaling_factor
# image = self.vae.decode(out_latents, return_dict=False, generator=generator)[0]
# image = image.clamp(-1, 1)
# from wan.utils.utils import cache_video
# cache_video( tensor=image, save_file="decode.mp4", fps=25, nrow=1, normalize=True, value_range=(-1, 1))
face_masks = torch.nn.functional.interpolate(face_masks.float().squeeze(2),
(ref_latents.shape[-2],
ref_latents.shape[-1]),
mode="bilinear").unsqueeze(2).to(dtype=ref_latents.dtype)
audio_input, audio_len = get_audio_feature(self.feature_extractor, audio_guide, duration = frame_num/fps )
audio_prompts = audio_input[0]
weight_dtype = audio_prompts.dtype
motion_pose = np.array([25] * 4)
motion_exp = np.array([30] * 4)
motion_pose = torch.from_numpy(motion_pose).unsqueeze(0)
motion_exp = torch.from_numpy(motion_exp).unsqueeze(0)
audio_prompts = encode_audio(self.wav2vec, audio_prompts.to(dtype=self.wav2vec.dtype), fps, num_frames=audio_len)
audio_prompts = audio_prompts.to(self.model.dtype)
if audio_prompts.shape[1] <= 129:
audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1,129-audio_prompts.shape[1], 1, 1, 1)], dim=1)
else:
audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1, 5, 1, 1, 1)], dim=1)
uncond_audio_prompts = torch.zeros_like(audio_prompts[:,:129])
# target_frame_num = min(target_frame_num, audio_len)
samples = self.pipeline(
prompt=input_prompt,
height=target_height,
@ -803,9 +961,18 @@ class HunyuanVideoSampler(Inference):
generator=generator,
output_type="pil",
name = name,
pixel_value_llava = pixel_value_llava,
uncond_pixel_value_llava=uncond_pixel_value_llava,
pixel_value_ref=pixel_value_ref,
pixel_value_ref = pixel_value_ref,
ref_latents=ref_latents, # [1, 16, 1, h//8, w//8]
pixel_value_llava=pixel_value_llava, # [1, 3, 336, 336]
uncond_pixel_value_llava=uncond_pixel_value_llava,
face_masks=face_masks, # [b f h w]
audio_prompts=audio_prompts,
uncond_audio_prompts=uncond_audio_prompts,
motion_exp=motion_exp,
motion_pose=motion_pose,
fps= torch.from_numpy(np.array(fps)),
denoise_strength=denoise_strength,
ip_cfg_scale=ip_cfg_scale,
freqs_cis=(freqs_cos, freqs_sin),
@ -825,9 +992,9 @@ class HunyuanVideoSampler(Inference):
callback = callback,
callback_steps = callback_steps,
)[0]
gen_time = time.time() - start_time
if samples == None:
return None
samples = samples.sub_(0.5).mul_(2).squeeze(0)
samples = samples.squeeze(0)
return samples
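
The avatar branch above builds ref_latents from a single reference image: the frame is repeated 1 + 4*2 = 9 times before VAE encoding, and the middle latent frame is then tiled 31 times so that [first, middle*31, last] matches the latent length of a 129-frame clip. A small sketch of that bookkeeping, assuming the usual causal 4x temporal compression of the "884" Hunyuan VAE (1 + (F - 1) // 4 latent frames for F input frames):

def latent_frames(frames: int) -> int:
    # assumed formula for the causal "884" VAE: 4x temporal compression, first frame kept whole
    return 1 + (frames - 1) // 4

ref_frames = 1 + 4 * 2                 # reference image repeated 9 times
print(latent_frames(ref_frames))       # 3 latent frames: [first, middle, last]
print(latent_frames(129))              # 33, matched by 1 + 31 + 1 after tiling the middle frame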

View File

@ -0,0 +1,220 @@
"""
This module provides the implementation of an Audio Projection Model, which is designed for
audio processing tasks. The model takes audio embeddings as input and outputs context tokens
that can be used for various downstream applications, such as audio analysis or synthesis.
The AudioProjModel class is based on the ModelMixin class from the diffusers library, which
provides a foundation for building custom models. This implementation includes multiple linear
layers with ReLU activation functions and a LayerNorm for normalization.
Key Features:
- Audio embedding input with flexible sequence length and block structure.
- Multiple linear layers for feature transformation.
- ReLU activation for non-linear transformation.
- LayerNorm for stabilizing and speeding up training.
- Rearrangement of input embeddings to match the model's expected input shape.
- Customizable number of blocks, channels, and context tokens for adaptability.
The module is structured to be easily integrated into larger systems or used as a standalone
component for audio feature extraction and processing.
Classes:
- AudioProjModel: A class representing the audio projection model with configurable parameters.
Functions:
- (none)
Dependencies:
- torch: For tensor operations and neural network components.
- diffusers: For the ModelMixin base class.
- einops: For tensor rearrangement operations.
"""
import torch
from diffusers import ModelMixin
from einops import rearrange
import math
import torch.nn as nn
class AudioProjNet2(ModelMixin):
"""Audio Projection Model
This class defines an audio projection model that takes audio embeddings as input
and produces context tokens as output. The model is based on the ModelMixin class
and consists of multiple linear layers and activation functions. It can be used
for various audio processing tasks.
Attributes:
seq_len (int): The length of the audio sequence.
blocks (int): The number of blocks in the audio projection model.
channels (int): The number of channels in the audio projection model.
intermediate_dim (int): The intermediate dimension of the model.
context_tokens (int): The number of context tokens in the output.
output_dim (int): The output dimension of the context tokens.
Methods:
__init__(self, seq_len=5, blocks=12, channels=768, intermediate_dim=512, context_tokens=32, output_dim=768):
Initializes the AudioProjModel with the given parameters.
forward(self, audio_embeds):
Defines the forward pass for the AudioProjModel.
Parameters:
audio_embeds (torch.Tensor): The input audio embeddings with shape (batch_size, video_length, blocks, channels).
Returns:
context_tokens (torch.Tensor): The output context tokens with shape (batch_size, video_length, context_tokens, output_dim).
"""
def __init__(
self,
seq_len=5,
blocks=12, # add a new parameter blocks
channels=768, # add a new parameter channels
intermediate_dim=512,
output_dim=768,
context_tokens=4,
):
super().__init__()
self.seq_len = seq_len
self.blocks = blocks
self.channels = channels
self.input_dim = (
seq_len * blocks * channels
)
self.intermediate_dim = intermediate_dim
self.context_tokens = context_tokens
self.output_dim = output_dim
# define multiple linear layers
self.proj1 = nn.Linear(self.input_dim, intermediate_dim)
self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)
self.norm = nn.LayerNorm(output_dim)
def forward(self, audio_embeds):
video_length = audio_embeds.shape[1]
audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
batch_size, window_size, blocks, channels = audio_embeds.shape
audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels)
audio_embeds = torch.relu(self.proj1(audio_embeds))
audio_embeds = torch.relu(self.proj2(audio_embeds))
context_tokens = self.proj3(audio_embeds).reshape(
batch_size, self.context_tokens, self.output_dim
)
context_tokens = self.norm(context_tokens)
out_all = rearrange(
context_tokens, "(bz f) m c -> bz f m c", f=video_length
)
return out_all
def reshape_tensor(x, heads):
bs, length, width = x.shape
# (bs, length, width) --> (bs, length, n_heads, dim_per_head)
x = x.view(bs, length, heads, -1)
# (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
x = x.transpose(1, 2)
# (bs, n_heads, length, dim_per_head)
x = x.reshape(bs, heads, length, -1)
return x
class PerceiverAttentionCA(nn.Module):
def __init__(self, *, dim=3072, dim_head=1024, heads=33):
super().__init__()
self.scale = dim_head ** -0.5
self.dim_head = dim_head
self.heads = heads
inner_dim = dim_head #* heads
self.norm1 = nn.LayerNorm(dim)
self.norm2 = nn.LayerNorm(dim)
self.to_q = nn.Linear(dim, inner_dim, bias=False)
self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
self.to_out = nn.Linear(inner_dim, dim, bias=False)
import torch.nn.init as init
init.zeros_(self.to_out.weight)
if self.to_out.bias is not None:
init.zeros_(self.to_out.bias)
def forward(self, x, latents):
"""
Args:
x (torch.Tensor): image features
shape (b, t, aa, D)
latent (torch.Tensor): latent features
shape (b, t, hw, D)
"""
x = self.norm1(x)
latents = self.norm2(latents)
# print("latents shape: ", latents.shape)
# print("x shape: ", x.shape)
q = self.to_q(latents)
k, v = self.to_kv(x).chunk(2, dim=-1)
# attention
scale = 1 / math.sqrt(math.sqrt(self.dim_head))
weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
out = weight @ v
# out = out.permute(0, 2, 1, 3)
return self.to_out(out)
#def forward(self, x, latents):
# """
# Args:
# x (torch.Tensor): image features
# shape (b, t, aa, D)
# latent (torch.Tensor): latent features
# shape (b, t, hw, D)
# """
# if get_sequence_parallel_state():
# sp_size = nccl_info.sp_size
# sp_rank = nccl_info.rank_within_group
# print("rank:", latents.shape, sp_size, sp_rank)
# latents = torch.chunk(latents, sp_size, dim=1)[sp_rank]
# x = self.norm1(x)
# latents = self.norm2(latents)
# # print("latents shape: ", latents.shape)
# # print("x shape: ", x.shape)
# q = self.to_q(latents)
# k, v = self.to_kv(x).chunk(2, dim=-1)
# # print("q, k, v: ", q.shape, k.shape, v.shape)
# # attention
# #scale = 1 / math.sqrt(math.sqrt(self.dim_head))
# #weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
# #weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
# #out = weight @ v
# def shrink_head(encoder_state, dim):
# local_heads = encoder_state.shape[dim] // nccl_info.sp_size
# return encoder_state.narrow(dim, nccl_info.rank_within_group * local_heads, local_heads)
# if get_sequence_parallel_state():
# # batch_size, seq_len, attn_heads, head_dim
# q = all_to_all_4D(q, scatter_dim=2, gather_dim=1) # [2, 32256, 24, 128]
# k = shrink_head(k ,dim=2)
# v = shrink_head(v ,dim=2)
# qkv = torch.stack([query, key, value], dim=2)
# attn = flash_attn_no_pad(qkv, causal=False, dropout_p=0.0, softmax_scale=None)
# # out = out.permute(0, 2, 1, 3)
# #b, s, a, d = attn.shape
# #attn = attn.reshape(b, s, -1)
#
# out = self.to_out(attn)
# if get_sequence_parallel_state():
# out = all_gather(out, dim=1)
# return out
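
AudioProjNet2 collapses each frame's 10 x 5 x 384 Whisper window into 4 context tokens of width 3072, and PerceiverAttentionCA lets the per-frame image tokens attend to those tokens (in models.py the tokens of 4 consecutive frames are regrouped to 16 per latent frame before this call). A shape sketch, assuming the two classes above are in scope; the dummy 240 tokens per frame are an arbitrary placeholder:

import torch

proj = AudioProjNet2(seq_len=10, blocks=5, channels=384,
                     intermediate_dim=1024, output_dim=3072, context_tokens=4)
ca = PerceiverAttentionCA(dim=3072, dim_head=1024, heads=33)

audio = torch.randn(1, 33, 10, 5, 384)        # (b, latent frames, window, blocks, channels)
tokens = proj(audio)                          # (1, 33, 4, 3072) audio context tokens
img_tokens = torch.randn(1, 33, 240, 3072)    # dummy per-frame image tokens

out = ca(tokens, img_tokens)                  # image tokens query the audio tokens
print(tokens.shape, out.shape)                # torch.Size([1, 33, 4, 3072]) torch.Size([1, 33, 240, 3072])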

View File

@ -53,10 +53,11 @@ class PatchEmbed(nn.Module):
def forward(self, x):
x = self.proj(x)
shape = x.shape
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
x = self.norm(x)
return x
return x, shape
class TextProjection(nn.Module):

View File

@ -19,6 +19,7 @@ from .token_refiner import SingleTokenRefiner
import numpy as np
from mmgp import offload
from wan.modules.attention import pay_attention
from .audio_adapters import AudioProjNet2, PerceiverAttentionCA
def get_linear_split_map():
hidden_size = 3072
@ -589,7 +590,8 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
use_attention_mask: bool = True,
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
attention_mode: Optional[str] = "sdpa"
attention_mode: Optional[str] = "sdpa",
avatar = False,
):
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
@ -708,6 +710,45 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
get_activation_layer("silu"),
**factory_kwargs,
)
avatar_audio = avatar
if avatar_audio:
self.ref_in = PatchEmbed(
self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
)
# -------------------- audio_proj_model --------------------
self.audio_proj = AudioProjNet2(seq_len=10, blocks=5, channels=384, intermediate_dim=1024, output_dim=3072, context_tokens=4)
# -------------------- motion-embeder --------------------
self.motion_exp = TimestepEmbedder(
self.hidden_size // 4,
get_activation_layer("silu"),
**factory_kwargs
)
self.motion_pose = TimestepEmbedder(
self.hidden_size // 4,
get_activation_layer("silu"),
**factory_kwargs
)
self.fps_proj = TimestepEmbedder(
self.hidden_size,
get_activation_layer("silu"),
**factory_kwargs
)
self.before_proj = nn.Linear(self.hidden_size, self.hidden_size)
# -------------------- audio_insert_model --------------------
self.double_stream_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
self.single_stream_list = []
self.double_stream_map = {str(i): j for j, i in enumerate(self.double_stream_list)}
self.single_stream_map = {str(i): j+len(self.double_stream_list) for j, i in enumerate(self.single_stream_list)}
self.audio_adapter_blocks = nn.ModuleList([
PerceiverAttentionCA(dim=3072, dim_head=1024, heads=33) for _ in range(len(self.double_stream_list) + len(self.single_stream_list))
])
def lock_layers_dtypes(self, dtype = torch.float32):
@ -750,11 +791,17 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
guidance: torch.Tensor = None, # Guidance for modulation, should be cfg_scale x 1000.
pipeline=None,
x_id = 0,
step_no = 0,
callback = None,
audio_prompts = None,
motion_exp = None,
motion_pose = None,
fps = None,
face_mask = None,
) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
img = x
batch_no, _, ot, oh, ow = x.shape
bsz, _, ot, oh, ow = x.shape
del x
txt = text_states
tt, th, tw = (
@ -765,6 +812,17 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
# Prepare modulation vectors.
vec = self.time_in(t)
if motion_exp != None:
vec += self.motion_exp(motion_exp.view(-1)).view(bsz, -1) # (b, 3072)
if motion_pose != None:
vec += self.motion_pose(motion_pose.view(-1)).view(bsz, -1) # (b, 3072)
if fps != None:
vec += self.fps_proj(fps) # (b, 3072)
if audio_prompts != None:
audio_feature_all = self.audio_proj(audio_prompts)
audio_feature_pad = audio_feature_all[:,:1].repeat(1,3,1,1)
audio_feature_all_insert = torch.cat([audio_feature_pad, audio_feature_all], dim=1).view(bsz, ot, 16, 3072)
audio_feature_all = None
if self.i2v_condition_type == "token_replace":
token_replace_t = torch.zeros_like(t)
@ -777,7 +835,6 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
# token_replace_mask_txt = None
# text modulation
# vec = vec + self.vector_in(text_states_2)
vec_2 = self.vector_in(text_states_2)
del text_states_2
vec += vec_2
@ -793,12 +850,17 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
)
# our timestep_embedding is merged into guidance_in(TimestepEmbedder)
vec = vec + self.guidance_in(guidance)
vec += self.guidance_in(guidance)
# Embed image and text.
img = self.img_in(img)
if ref_latents != None:
ref_latents = self.img_in(ref_latents)
img, shape_mask = self.img_in(img)
if audio_prompts != None:
ref_latents_first = ref_latents[:, :, :1].clone()
ref_latents,_ = self.ref_in(ref_latents)
ref_latents_first,_ = self.img_in(ref_latents_first)
elif ref_latents != None:
ref_latents, _ = self.img_in(ref_latents)
if self.text_projection == "linear":
txt = self.txt_in(txt)
elif self.text_projection == "single_refiner":
@ -808,7 +870,18 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
f"Unsupported text_projection: {self.text_projection}"
)
if ref_latents == None:
if audio_prompts != None:
img += self.before_proj(ref_latents)
ref_length = ref_latents_first.shape[-2] # [b s c]
img = torch.cat([ref_latents_first, img], dim=-2) # t c
img_len = img.shape[1]
mask_len = img_len - ref_length
if face_mask.shape[2] == 1:
face_mask = face_mask.repeat(1,1,ot,1,1) # repeat if number of mask frame is 1
face_mask = torch.nn.functional.interpolate(face_mask, size=[ot, shape_mask[-2], shape_mask[-1]], mode="nearest")
# face_mask = face_mask.view(-1,mask_len,1).repeat(1,1,img.shape[-1]).type_as(img)
face_mask = face_mask.view(-1,mask_len,1).type_as(img)
elif ref_latents == None:
ref_length = None
else:
ref_length = ref_latents.shape[-2]
@ -828,13 +901,13 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
if x_id == 0:
self.should_calc = True
inp = img[0:1]
vec_ = vec
vec_ = vec[0:1]
( img_mod1_shift, img_mod1_scale, _ , _ , _ , _ , ) = self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1)
normed_inp = self.double_blocks[0].img_norm1(inp)
normed_inp = normed_inp.to(torch.bfloat16)
modulated_inp = modulate( normed_inp, shift=img_mod1_shift, scale=img_mod1_scale )
del normed_inp, img_mod1_shift, img_mod1_scale
if self.teacache_counter <= self.teacache_start_step or self.teacache_counter == self.num_steps-1:
if step_no <= self.teacache_start_step or step_no == self.num_steps-1:
self.accumulated_rel_l1_distance = 0
else:
coefficients = [7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
@ -846,9 +919,6 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
else:
self.accumulated_rel_l1_distance = 0
self.previous_modulated_input = modulated_inp
self.teacache_counter += 1
if self.teacache_counter == self.num_steps:
self.teacache_counter = 0
else:
self.should_calc = True
@ -859,7 +929,7 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
self.previous_residual[x_id] = None
ori_img = img[0:1].clone()
# --------------------- Pass through DiT blocks ------------------------
for _, block in enumerate(self.double_blocks):
for layer_num, block in enumerate(self.double_blocks):
for i in range(len(img)):
if callback != None:
callback(-1, None, False, True)
@ -880,6 +950,16 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
img[i], txt[i] = block(*double_block_args)
double_block_args = None
# insert audio feature to img
if audio_prompts != None:
audio_adapter = getattr(self.double_blocks[layer_num], "audio_adapter", None)
if audio_adapter != None:
real_img = img[i:i+1,ref_length:].view(1, ot, -1, 3072)
real_img = audio_adapter(audio_feature_all_insert[i:i+1], real_img).view(1, -1, 3072)
real_img *= face_mask[i:i+1]
img[i:i+1, ref_length:] += real_img
real_img = None
for _, block in enumerate(self.single_blocks):
for i in range(len(img)):
@ -932,6 +1012,7 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
img = torch.cat(img_list)
img_list = None
# img = self.unpatchify(img, tt, th, tw)
img = self.unpatchify(img, tt, th, tw)
return img
@ -1015,5 +1096,14 @@ HUNYUAN_VIDEO_CONFIG = {
"heads_num": 24,
"mlp_width_ratio": 4,
},
'HYVideo-T/2-avatar': { # 9.0B / 12.5B
'mm_double_blocks_depth': 20,
'mm_single_blocks_depth': 40,
'rope_dim_list': [16, 56, 56],
'hidden_size': 3072,
'heads_num': 24,
'mlp_width_ratio': 4,
'avatar': True,
},
}
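
The HYVideo-T/2-avatar config enables the audio path above, and adapt_avatar_model in hunyuan.py then re-homes each entry of audio_adapter_blocks onto its double block via double_stream_map, so the forward loop only needs a getattr per layer. A toy reconstruction of that routing (ToyModel and the Identity blocks are placeholders, not the real transformer):

import torch.nn as nn

class ToyModel(nn.Module):                                   # placeholder for HYVideoDiffusionTransformer
    def __init__(self):
        super().__init__()
        self.double_stream_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
        self.double_stream_map = {str(i): j for j, i in enumerate(self.double_stream_list)}
        self.double_blocks = nn.ModuleList([nn.Identity() for _ in range(20)])
        self.audio_adapter_blocks = nn.ModuleList([nn.Identity() for _ in self.double_stream_list])

model = ToyModel()
modules_dict = {k: m for k, m in model.named_modules()}
for model_layer, avatar_layer in model.double_stream_map.items():
    setattr(modules_dict[f"double_blocks.{model_layer}"], "audio_adapter",
            modules_dict[f"audio_adapter_blocks.{avatar_layer}"])
delattr(model, "audio_adapter_blocks")

print(getattr(model.double_blocks[7], "audio_adapter", None) is not None)   # True: mapped layers carry an adapter
print(getattr(model.double_blocks[6], "audio_adapter", None))               # None: unmapped layers skip the audio cross-attention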

wgp.py
View File

@ -190,15 +190,19 @@ def process_prompt_and_add_tasks(state, model_choice):
if video_length > sliding_window_size:
gr.Info(f"The Number of Frames to generate ({video_length}) is greater than the Sliding Window Size ({sliding_window_size}) , multiple Windows will be generated")
if "phantom" in model_filename or "hunyuan_video_custom" in model_filename:
if "phantom" in model_filename or "hunyuan_video_custom" in model_filename or "hunyuan_video_avatar" in model_filename:
image_refs = inputs["image_refs"]
audio_guide = inputs["audio_guide"]
if image_refs == None :
gr.Info("You must provide an Image Reference")
return
if len(image_refs) > 1 and "hunyuan_video_custom" in model_filename:
gr.Info("Only one Image Reference (a person) is supported for the moment by Hunyuan Custom")
if len(image_refs) > 1 and ("hunyuan_video_custom" in model_filename or "hunyuan_video_avatar" in model_filename):
gr.Info("Only one Image Reference (a person) is supported for the moment by Hunyuan Custom / Avatar")
return
if audio_guide == None and "hunyuan_video_avatar" in model_filename:
gr.Info("You must provide an audio file")
return
if any(isinstance(image[0], str) for image in image_refs) :
gr.Info("Reference Image should be an Image")
return
@ -1539,7 +1543,9 @@ wan_choices_i2v=["ckpts/wan2.1_image2video_480p_14B_mbf16.safetensors", "ckpts/w
ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_lora128_bf16.safetensors"]
hunyuan_choices= ["ckpts/hunyuan_video_720_bf16.safetensors", "ckpts/hunyuan_video_720_quanto_int8.safetensors", "ckpts/hunyuan_video_i2v_720_bf16v2.safetensors", "ckpts/hunyuan_video_i2v_720_quanto_int8v2.safetensors",
"ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors" ]
"ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors",
"ckpts/hunyuan_video_avatar_720_bf16.safetensors", "ckpts/hunyuan_video_avatar_720_quanto_bf16_int8.safetensors",
]
transformer_choices = wan_choices_t2v + wan_choices_i2v + ltxv_choices + hunyuan_choices
def get_dependent_models(model_filename, quantization, dtype_policy ):
@ -1549,12 +1555,13 @@ def get_dependent_models(model_filename, quantization, dtype_policy ):
return [get_model_filename("ltxv_13B", quantization, dtype_policy)]
else:
return []
model_types = [ "t2v_1.3B", "t2v", "i2v", "i2v_720p", "flf2v_720p", "vace_1.3B","vace_14B","moviigen", "phantom_1.3B", "phantom_14B", "fantasy", "fun_inp_1.3B", "fun_inp", "recam_1.3B", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
model_types = [ "t2v_1.3B", "t2v", "i2v", "i2v_720p", "flf2v_720p", "vace_1.3B","vace_14B","moviigen", "phantom_1.3B", "phantom_14B", "fantasy", "fun_inp_1.3B", "fun_inp", "recam_1.3B", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom", "hunyuan_avatar"]
model_signatures = {"t2v": "text2video_14B", "t2v_1.3B" : "text2video_1.3B", "fun_inp_1.3B" : "Fun_InP_1.3B", "fun_inp" : "Fun_InP_14B",
"i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "vace_14B" : "Vace_14B","recam_1.3B": "recammaster_1.3B",
"flf2v_720p" : "FLF2V_720p", "sky_df_1.3B" : "sky_reels2_diffusion_forcing_1.3B", "sky_df_14B" : "sky_reels2_diffusion_forcing_14B",
"sky_df_720p_14B" : "sky_reels2_diffusion_forcing_720p_14B", "moviigen" :"moviigen",
"phantom_1.3B" : "phantom_1.3B", "phantom_14B" : "phantom_14B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" }
"phantom_1.3B" : "phantom_1.3B", "phantom_14B" : "phantom_14B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled",
"hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom", "hunyuan_avatar" : "hunyuan_video_avatar" }
def get_model_type(model_filename):
@ -1639,7 +1646,10 @@ def get_model_name(model_filename, description_container = [""]):
description = "A good looking image 2 video model, but not so good in prompt adherence."
elif "hunyuan_video_custom" in model_filename:
model_name = "Hunyuan Video Custom 720p 13B"
description = "The Hunyuan Video Custom model is proably the best model to transfer people (only people for the momment) as it is quite good to keep their identity. However it is slow as to get good results, you need to generate 720p videos with 30 steps."
description = "The Hunyuan Video Custom model is probably the best model to transfer people (only people for the momment) as it is quite good to keep their identity. However it is slow as to get good results, you need to generate 720p videos with 30 steps."
elif "hunyuan_video_avatar" in model_filename:
model_name = "Hunyuan Video Avatar 720p 13B"
description = "With the Hunyuan Video Avatar model you can animate a person based on the content of an audio input. Please note that the video generator works by processing 128 frames segment at a time (even if you ask less). The good news is that it will concatenate multiple segments for long video generation (max 3 segments recommended as the quality will get worse)."
else:
model_name = "Wan2.1 text2video"
model_name += " 14B" if "14B" in model_filename else " 1.3B"
@ -1758,7 +1768,14 @@ def get_default_settings(filename):
ui_defaults.update({
"guidance_scale": 7.5,
"flow_shift": 13,
"resolution": "1280x720"
"resolution": "1280x720",
})
elif get_model_type(filename) in ("hunyuan_avatar"):
ui_defaults.update({
"guidance_scale": 7.5,
"flow_shift": 5,
"tea_cache_start_step_perc": 25,
"video_length": 129,
})
elif get_model_type(filename) in ("vace_14B"):
ui_defaults.update({
@ -1954,8 +1971,13 @@ def download_models(transformer_filename):
text_encoder_filename = get_hunyuan_text_encoder_filename(text_encoder_quantization)
model_def = {
"repoId" : "DeepBeepMeep/HunyuanVideo",
"sourceFolderList" : [ "llava-llama-3-8b", "clip_vit_large_patch14", "" ],
"fileList" :[ ["config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "preprocessor_config.json"] + computeList(text_encoder_filename) , ["config.json", "merges.txt", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "vocab.json"], [ "hunyuan_video_720_quanto_int8_map.json", "hunyuan_video_custom_VAE_fp32.safetensors", "hunyuan_video_custom_VAE_config.json", "hunyuan_video_VAE_fp32.safetensors", "hunyuan_video_VAE_config.json" , "hunyuan_video_720_quanto_int8_map.json" ] + computeList(transformer_filename) ]
"sourceFolderList" : [ "llava-llama-3-8b", "clip_vit_large_patch14", "whisper-tiny" , "det_align", "" ],
"fileList" :[ ["config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "preprocessor_config.json"] + computeList(text_encoder_filename) ,
["config.json", "merges.txt", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "vocab.json"],
["config.json", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer_config.json"],
["detface.pt"],
[ "hunyuan_video_720_quanto_int8_map.json", "hunyuan_video_custom_VAE_fp32.safetensors", "hunyuan_video_custom_VAE_config.json", "hunyuan_video_VAE_fp32.safetensors", "hunyuan_video_VAE_config.json" , "hunyuan_video_720_quanto_int8_map.json" ] + computeList(transformer_filename)
]
}
else:
@ -2122,6 +2144,14 @@ def load_hunyuan_model(model_filename, quantizeTransformer = False, dtype = torc
pipe = { "transformer" : hunyuan_model.model, "text_encoder" : hunyuan_model.text_encoder, "text_encoder_2" : hunyuan_model.text_encoder_2, "vae" : hunyuan_model.vae }
if hunyuan_model.wav2vec != None:
pipe["wav2vec"] = hunyuan_model.wav2vec
# if hunyuan_model.align_instance != None:
# pipe["align_instance"] = hunyuan_model.align_instance.facedet.model
from hyvideo.modules.models import get_linear_split_map
split_linear_modules_map = get_linear_split_map()
@ -2818,9 +2848,13 @@ def generate_video(
hunyuan_t2v = "hunyuan_video_720" in model_filename
hunyuan_i2v = "hunyuan_video_i2v" in model_filename
hunyuan_custom = "hunyuan_video_custom" in model_filename
hunyuan_avatar = "hunyuan_video_avatar" in model_filename
fantasy = "fantasy" in model_filename
if diffusion_forcing or hunyuan_t2v or hunyuan_i2v or hunyuan_custom:
fps = 24
elif audio_guide != None:
elif hunyuan_avatar:
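# Hunyuan Avatar generates audio-driven video at 25 frames per second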
fps = 25
elif fantasy:
fps = 23
elif ltxv:
fps = 30
@ -2829,11 +2863,13 @@ def generate_video(
latent_size = 8 if ltxv else 4
original_image_refs = image_refs
if image_refs != None and len(image_refs) > 0 and (hunyuan_custom or phantom or vace):
send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
if image_refs != None and len(image_refs) > 0 and (hunyuan_custom or phantom or hunyuan_avatar or vace):
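# Hunyuan Avatar uses the reference image as-is, so background removal is forced off below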
if hunyuan_avatar: remove_background_images_ref = 0
if remove_background_images_ref > 0:
send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
from wan.utils.utils import resize_and_remove_background
image_refs = resize_and_remove_background(image_refs, width, height, remove_background_images_ref, fit_into_canvas= not vace)
image_refs = resize_and_remove_background(image_refs, width, height, remove_background_images_ref, fit_into_canvas= not (vace or hunyuan_avatar) ) # no canvas fit for Vace reference images (handled later) or for Hunyuan Avatar
update_task_thumbnails(task, locals())
send_cmd("output")
@ -2866,13 +2902,14 @@ def generate_video(
audio_proj_split = None
audio_scale = None
audio_context_lens = None
if audio_guide != None:
if (fantasy or hunyuan_avatar) and audio_guide != None:
from fantasytalking.infer import parse_audio
import librosa
duration = librosa.get_duration(path=audio_guide)
current_video_length = min(int(fps * duration // 4) * 4 + 5, current_video_length)
audio_proj_split, audio_context_lens = parse_audio(audio_guide, num_frames= current_video_length, fps= fps, device= processing_device )
audio_scale = 1.0
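# cap the requested length at the audio duration, rounded to the 4n+1 frame count expected by the models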
current_video_length = min(int(fps * duration // 4) * 4 + 5, current_video_length)
if fantasy:
audio_proj_split, audio_context_lens = parse_audio(audio_guide, num_frames= current_video_length, fps= fps, device= processing_device )
audio_scale = 1.0
import random
if seed == None or seed <0:
@ -2990,7 +3027,7 @@ def generate_video(
if reuse_frames > 0:
return_latent_slice = slice(-(reuse_frames - 1 + discard_last_frames ) // latent_size, None if discard_last_frames == 0 else -(discard_last_frames // latent_size) )
if hunyuan_custom:
if hunyuan_custom or hunyuan_avatar:
src_ref_images = image_refs
elif phantom:
src_ref_images = image_refs.copy() if image_refs != None else None
@ -3098,6 +3135,7 @@ def generate_video(
cfg_star_switch = cfg_star_switch,
cfg_zero_step = cfg_zero_step,
audio_cfg_scale= audio_guidance_scale,
audio_guide=audio_guide,
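# the raw audio path (audio_guide) is passed through, presumably for Hunyuan Avatar's own audio processing; audio_proj / audio_context_lens are only computed for Fantasy above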
audio_proj= audio_proj_split,
audio_scale= audio_scale,
audio_context_lens= audio_context_lens,
@ -4502,6 +4540,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
hunyuan_t2v = "hunyuan_video_720" in model_filename
hunyuan_i2v = "hunyuan_video_i2v" in model_filename
hunyuan_video_custom = "hunyuan_video_custom" in model_filename
hunyuan_video_avatar = "hunyuan_video_avatar" in model_filename
sliding_window_enabled = vace or diffusion_forcing or ltxv
new_line_text = "each new line of prompt will be used for a window" if sliding_window_enabled else "each new line of prompt will generate a new video"
@ -4575,7 +4614,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
model_mode = gr.Dropdown(value=None, visible=False)
keep_frames_video_source = gr.Text(visible=False)
with gr.Column(visible= vace or phantom or hunyuan_video_custom) as video_prompt_column:
with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar) as video_prompt_column:
video_prompt_type_value= ui_defaults.get("video_prompt_type","")
video_prompt_type = gr.Text(value= video_prompt_type_value, visible= False)
with gr.Row():
@ -4624,14 +4663,14 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
("Keep it for first Image (landscape) and remove it for other Images (objects / faces)", 2),
],
value=ui_defaults.get("remove_background_images_ref",1),
label="Remove Background of Images References", scale = 3, visible= "I" in video_prompt_type_value
label="Remove Background of Images References", scale = 3, visible= "I" in video_prompt_type_value and not hunyuan_video_avatar
)
# remove_background_images_ref = gr.Checkbox(value=ui_defaults.get("remove_background_images_ref",1), label= "Remove Background of Images References", visible= "I" in video_prompt_type_value, scale =1 )
video_mask = gr.Video(label= "Video Mask (for Inpainting or Outpaing, white pixels = Mask)", visible= "M" in video_prompt_type_value, value= ui_defaults.get("video_mask", None))
audio_guide = gr.Audio(value= ui_defaults.get("audio_guide", None), type="filepath", label="Voice to follow", show_download_button= True, visible= fantasy )
audio_guide = gr.Audio(value= ui_defaults.get("audio_guide", None), type="filepath", label="Voice to follow", show_download_button= True, visible= fantasy or hunyuan_video_avatar )
advanced_prompt = advanced_ui
prompt_vars=[]
@ -4720,6 +4759,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
video_length = gr.Slider(17, 737, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (16 = 1s)", interactive= True)
elif fantasy:
video_length = gr.Slider(5, 233, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (23 = 1s)", interactive= True)
elif hunyuan_video_avatar:
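# up to 401 frames, i.e. about 16 s at 25 fps (roughly three of the model's 128-frame segments)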
video_length = gr.Slider(5, 401, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (25 = 1s)", interactive= True)
elif hunyuan_t2v or hunyuan_i2v:
video_length = gr.Slider(5, 337, value=ui_defaults.get("video_length", 97), step=4, label="Number of frames (24 = 1s)", interactive= True)
else:
@ -4809,7 +4850,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
)
with gr.Tab("Quality", visible = not ltxv) as quality_tab:
with gr.Column(visible = not (hunyuan_i2v or hunyuan_t2v or hunyuan_video_custom) ) as skip_layer_guidance_row:
with gr.Column(visible = not (hunyuan_i2v or hunyuan_t2v or hunyuan_video_custom or hunyuan_video_avatar) ) as skip_layer_guidance_row:
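# Skip Layer Guidance is not exposed for the Hunyuan models, so hide its controls for them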
gr.Markdown("<B>Skip Layer Guidance (improves video quality)</B>")
with gr.Row():
slg_switch = gr.Dropdown(