Support for Hunyuan Video Avatar

DeepBeepMeep 2025-06-05 15:35:58 +02:00
parent 7670af9610
commit 1976868f6a
15 changed files with 2562 additions and 81 deletions

View File

@ -21,7 +21,7 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
## 🔥 Latest News!!
* May 26 2025: 👋 WanGP v5.31 : Added Phantom 14B, a model that you can use to transfer objects / people in the video. My preference goes to Vace that remains the king of controlnets.
* May 28 2025: 👋 WanGP v5.31 : Added Phantom 14B, a model that you can use to transfer objects / people in the video. My preference goes to Vace that remains the king of controlnets.
* May 26 2025: 👋 WanGP v5.3 : Happy with a Video generation and want to do more generations using the same settings, but you can't remember what you did or find it too tedious to copy / paste each setting one by one from the file metadata? Rejoice! There are now multiple ways to turn this chore into a one click task:
- Select one Video recently generated in the Video Gallery and click *Use Selected Video Settings*
- Click *Drop File Here* and select a Video you saved somewhere; if the settings metadata were saved with the Video, they will be extracted automatically

View File

@ -0,0 +1,170 @@
import os
import cv2
import math
import json
import torch
import random
import librosa
import traceback
import torchvision
import numpy as np
import pandas as pd
from PIL import Image
from einops import rearrange
from torch.utils.data import Dataset
from decord import VideoReader, cpu
from transformers import CLIPImageProcessor
import torchvision.transforms as transforms
from torchvision.transforms import ToPILImage
def get_audio_feature(feature_extractor, audio_path):
audio_input, sampling_rate = librosa.load(audio_path, sr=16000)
assert sampling_rate == 16000
audio_features = []
window = 750*640
for i in range(0, len(audio_input), window):
audio_feature = feature_extractor(audio_input[i:i+window],
sampling_rate=sampling_rate,
return_tensors="pt",
).input_features
audio_features.append(audio_feature)
audio_features = torch.cat(audio_features, dim=-1)
return audio_features, len(audio_input) // 640
class VideoAudioTextLoaderVal(Dataset):
def __init__(
self,
image_size: int,
meta_file: str,
**kwargs,
):
super().__init__()
self.meta_file = meta_file
self.image_size = image_size
self.text_encoder = kwargs.get("text_encoder", None) # llava_text_encoder
self.text_encoder_2 = kwargs.get("text_encoder_2", None) # clipL_text_encoder
self.feature_extractor = kwargs.get("feature_extractor", None)
self.meta_files = []
csv_data = pd.read_csv(meta_file)
for idx in range(len(csv_data)):
self.meta_files.append(
{
"videoid": str(csv_data["videoid"][idx]),
"image_path": str(csv_data["image"][idx]),
"audio_path": str(csv_data["audio"][idx]),
"prompt": str(csv_data["prompt"][idx]),
"fps": float(csv_data["fps"][idx])
}
)
self.llava_transform = transforms.Compose(
[
transforms.Resize((336, 336), interpolation=transforms.InterpolationMode.BILINEAR),
transforms.ToTensor(),
transforms.Normalize((0.48145466, 0.4578275, 0.4082107), (0.26862954, 0.26130258, 0.27577711)),
]
)
self.clip_image_processor = CLIPImageProcessor()
self.device = torch.device("cuda")
self.weight_dtype = torch.float16
def __len__(self):
return len(self.meta_files)
@staticmethod
def get_text_tokens(text_encoder, description, dtype_encode="video"):
text_inputs = text_encoder.text2tokens(description, data_type=dtype_encode)
text_ids = text_inputs["input_ids"].squeeze(0)
text_mask = text_inputs["attention_mask"].squeeze(0)
return text_ids, text_mask
def get_batch_data(self, idx):
meta_file = self.meta_files[idx]
videoid = meta_file["videoid"]
image_path = meta_file["image_path"]
audio_path = meta_file["audio_path"]
prompt = "Authentic, Realistic, Natural, High-quality, Lens-Fixed, " + meta_file["prompt"]
fps = meta_file["fps"]
img_size = self.image_size
ref_image = Image.open(image_path).convert('RGB')
# Resize reference image
w, h = ref_image.size
scale = img_size / min(w, h)
new_w = round(w * scale / 64) * 64
new_h = round(h * scale / 64) * 64
if img_size == 704:
img_size_long = 1216
if new_w * new_h > img_size * img_size_long:
import math
scale = math.sqrt(img_size * img_size_long / w / h)
new_w = round(w * scale / 64) * 64
new_h = round(h * scale / 64) * 64
ref_image = ref_image.resize((new_w, new_h), Image.LANCZOS)
ref_image = np.array(ref_image)
ref_image = torch.from_numpy(ref_image)
audio_input, audio_len = get_audio_feature(self.feature_extractor, audio_path)
audio_prompts = audio_input[0]
motion_bucket_id_heads = np.array([25] * 4)
motion_bucket_id_exps = np.array([30] * 4)
motion_bucket_id_heads = torch.from_numpy(motion_bucket_id_heads)
motion_bucket_id_exps = torch.from_numpy(motion_bucket_id_exps)
fps = torch.from_numpy(np.array(fps))
to_pil = ToPILImage()
pixel_value_ref = rearrange(ref_image.clone().unsqueeze(0), "b h w c -> b c h w") # (b c h w)
pixel_value_ref_llava = [self.llava_transform(to_pil(image)) for image in pixel_value_ref]
pixel_value_ref_llava = torch.stack(pixel_value_ref_llava, dim=0)
pixel_value_ref_clip = self.clip_image_processor(
images=Image.fromarray((pixel_value_ref[0].permute(1,2,0)).data.cpu().numpy().astype(np.uint8)),
return_tensors="pt"
).pixel_values[0]
pixel_value_ref_clip = pixel_value_ref_clip.unsqueeze(0)
# Encode text prompts
text_ids, text_mask = self.get_text_tokens(self.text_encoder, prompt)
text_ids_2, text_mask_2 = self.get_text_tokens(self.text_encoder_2, prompt)
# Output batch
batch = {
"text_prompt": prompt, #
"videoid": videoid,
"pixel_value_ref": pixel_value_ref.to(dtype=torch.float16), # 参考图用于vae提特征 (1, 3, h, w), 取值范围(0, 255)
"pixel_value_ref_llava": pixel_value_ref_llava.to(dtype=torch.float16), # 参考图用于llava提特征 (1, 3, 336, 336), 取值范围 = CLIP取值范围
"pixel_value_ref_clip": pixel_value_ref_clip.to(dtype=torch.float16), # 参考图用于clip_image_encoder提特征 (1, 3, 244, 244), 取值范围 = CLIP取值范围
"audio_prompts": audio_prompts.to(dtype=torch.float16),
"motion_bucket_id_heads": motion_bucket_id_heads.to(dtype=text_ids.dtype),
"motion_bucket_id_exps": motion_bucket_id_exps.to(dtype=text_ids.dtype),
"fps": fps.to(dtype=torch.float16),
"text_ids": text_ids.clone(), # 对应llava_text_encoder
"text_mask": text_mask.clone(), # 对应llava_text_encoder
"text_ids_2": text_ids_2.clone(), # 对应clip_text_encoder
"text_mask_2": text_mask_2.clone(), # 对应clip_text_encoder
"audio_len": audio_len,
"image_path": image_path,
"audio_path": audio_path,
}
return batch
def __getitem__(self, idx):
return self.get_batch_data(idx)
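
For reference, get_audio_feature above chunks the 16 kHz waveform into 30-second Whisper windows (750 * 640 samples) and also returns how many 25 fps video frames the audio spans (640 samples per frame at 16 kHz). A minimal sketch of that bookkeeping, using the Hub id "openai/whisper-tiny" as a stand-in for the local ckpts/whisper-tiny checkpoint used by this commit:

import numpy as np
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")  # stand-in for ckpts/whisper-tiny
sr = 16000                                   # librosa.load(..., sr=16000)
window = 750 * 640                           # 480000 samples = 30 s, Whisper's native window
audio = np.zeros(10 * sr, dtype=np.float32)  # 10 s of dummy audio

feats = feature_extractor(audio[:window], sampling_rate=sr, return_tensors="pt").input_features
print(feats.shape)                           # (1, 80, 3000) log-mel features for whisper-tiny
print(len(audio) // 640)                     # 250: number of 25 fps frames covered by 10 s of audio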

View File

@ -0,0 +1,72 @@
import os
import cv2
import json
import time
import decord
import einops
import librosa
import torch
import random
import argparse
import traceback
import numpy as np
from tqdm import tqdm
from PIL import Image
from einops import rearrange
def get_facemask(ref_image, align_instance, area=1.25):
# ref_image: (b f c h w)
bsz, f, c, h, w = ref_image.shape
images = rearrange(ref_image, "b f c h w -> (b f) h w c").data.cpu().numpy().astype(np.uint8)
face_masks = []
for image in images:
image_pil = Image.fromarray(image).convert("RGB")
_, _, bboxes_list = align_instance(np.array(image_pil)[:,:,[2,1,0]], maxface=True)
try:
bboxSrc = bboxes_list[0]
except Exception:  # no face detected, fall back to the full frame
bboxSrc = [0, 0, w, h]
x1, y1, ww, hh = bboxSrc
x2, y2 = x1 + ww, y1 + hh
ww, hh = (x2-x1) * area, (y2-y1) * area
center = [(x2+x1)//2, (y2+y1)//2]
x1 = max(center[0] - ww//2, 0)
y1 = max(center[1] - hh//2, 0)
x2 = min(center[0] + ww//2, w)
y2 = min(center[1] + hh//2, h)
face_mask = np.zeros_like(np.array(image_pil))
face_mask[int(y1):int(y2), int(x1):int(x2)] = 1.0
face_masks.append(torch.from_numpy(face_mask[...,:1]))
face_masks = torch.stack(face_masks, dim=0) # (b*f, h, w, c)
face_masks = rearrange(face_masks, "(b f) h w c -> b c f h w", b=bsz, f=f)
face_masks = face_masks.to(device=ref_image.device, dtype=ref_image.dtype)
return face_masks
def encode_audio(wav2vec, audio_feats, fps, num_frames=129):
if fps == 25:
start_ts = [0]
step_ts = [1]
elif fps == 12.5:
start_ts = [0]
step_ts = [2]
num_frames = min(num_frames, 400)
audio_feats = wav2vec.encoder(audio_feats.unsqueeze(0)[:, :, :3000], output_hidden_states=True).hidden_states
audio_feats = torch.stack(audio_feats, dim=2)
audio_feats = torch.cat([torch.zeros_like(audio_feats[:,:4]), audio_feats], 1)
audio_prompts = []
for bb in range(1):
audio_feats_list = []
for f in range(num_frames):
cur_t = (start_ts[bb] + f * step_ts[bb]) * 2
audio_clip = audio_feats[bb:bb+1, cur_t: cur_t+10]
audio_feats_list.append(audio_clip)
audio_feats_list = torch.stack(audio_feats_list, 1)
audio_prompts.append(audio_feats_list)
audio_prompts = torch.cat(audio_prompts)
return audio_prompts
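
encode_audio assumes an fps of 25 or 12.5 and, for every video frame, slices a 10-step window out of the stacked Whisper encoder hidden states (whisper-tiny: 4 layers + embeddings = 5 states of width 384, 1500 steps per 30 s window). A shape sketch, again with "openai/whisper-tiny" standing in for the local checkpoint:

import torch
from transformers import WhisperModel

wav2vec = WhisperModel.from_pretrained("openai/whisper-tiny").eval()
log_mel = torch.zeros(1, 80, 3000)                    # one 30 s window of log-mel features
with torch.no_grad():
    hidden = wav2vec.encoder(log_mel, output_hidden_states=True).hidden_states
feats = torch.stack(hidden, dim=2)                    # (1, 1500, 5, 384)
feats = torch.cat([torch.zeros_like(feats[:, :4]), feats], dim=1)   # left-pad 4 steps, as above

num_frames = 129
clips = [feats[:, 2 * f: 2 * f + 10] for f in range(num_frames)]    # 25 fps: cur_t = 2 * f
audio_prompts = torch.stack(clips, dim=1)
print(audio_prompts.shape)   # (1, 129, 10, 5, 384): the input expected by AudioProjNet2(seq_len=10, blocks=5, channels=384)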

View File

@ -0,0 +1,41 @@
import os
import cv2
import torch
import numpy as np
import imageio
import torchvision
from einops import rearrange
def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8, quality=8):
videos = rearrange(videos, "b c t h w -> t b c h w")
outputs = []
for x in videos:
x = torchvision.utils.make_grid(x, nrow=n_rows)
x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
if rescale:
x = (x + 1.0) / 2.0 # -1,1 -> 0,1
x = torch.clamp(x,0,1)
x = (x * 255).numpy().astype(np.uint8)
outputs.append(x)
os.makedirs(os.path.dirname(path), exist_ok=True)
imageio.mimsave(path, outputs, fps=fps, quality=quality)
def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
crop_h, crop_w = crop_img.shape[:2]
target_w, target_h = size
scale_h, scale_w = target_h / crop_h, target_w / crop_w
if scale_w > scale_h:
resize_h = int(target_h*resize_ratio)
resize_w = int(crop_w / crop_h * resize_h)
else:
resize_w = int(target_w*resize_ratio)
resize_h = int(crop_h / crop_w * resize_w)
crop_img = cv2.resize(crop_img, (resize_w, resize_h))
pad_left = (target_w - resize_w) // 2
pad_top = (target_h - resize_h) // 2
pad_right = target_w - resize_w - pad_left
pad_bottom = target_h - resize_h - pad_top
crop_img = cv2.copyMakeBorder(crop_img, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=color)
return crop_img
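
pad_image letterboxes an image to a target (width, height): it resizes along the tighter dimension, keeps the aspect ratio, and centers the result with constant-color borders. A quick sketch, assuming the function above is in scope:

import numpy as np

img = np.zeros((480, 640, 3), dtype=np.uint8)   # h=480, w=640 dummy frame
out = pad_image(img, size=(336, 336))           # target (w, h)
print(out.shape)                                # (336, 336, 3): resized to 336x252, then padded
                                                # with 42 white rows on top and bottom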

View File

@ -0,0 +1 @@
from .align import AlignImage

View File

@ -0,0 +1,34 @@
import os
import sys
import torch
from .detface import DetFace
class AlignImage(object):
def __init__(self, device='cuda', det_path=''):
self.facedet = DetFace(pt_path=det_path, confThreshold=0.5, nmsThreshold=0.45, device=device)
@torch.no_grad()
def __call__(self, im, maxface=False):
bboxes, kpss, scores = self.facedet.detect(im)
face_num = bboxes.shape[0]
five_pts_list = []
scores_list = []
bboxes_list = []
for i in range(face_num):
five_pts_list.append(kpss[i].reshape(5,2))
scores_list.append(scores[i])
bboxes_list.append(bboxes[i])
if maxface and face_num>1:
max_idx = 0
max_area = (bboxes[0, 2])*(bboxes[0, 3])
for i in range(1, face_num):
area = (bboxes[i,2])*(bboxes[i,3])
if area>max_area:
max_idx = i
five_pts_list = [five_pts_list[max_idx]]
scores_list = [scores_list[max_idx]]
bboxes_list = [bboxes_list[max_idx]]
return five_pts_list, scores_list, bboxes_list

View File

@ -0,0 +1,283 @@
# -*- coding: UTF-8 -*-
import os
import cv2
import numpy as np
import torch
import torchvision
def xyxy2xywh(x):
# Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[:, 0] = (x[:, 0] + x[:, 2]) / 2 # x center
y[:, 1] = (x[:, 1] + x[:, 3]) / 2 # y center
y[:, 2] = x[:, 2] - x[:, 0] # width
y[:, 3] = x[:, 3] - x[:, 1] # height
return y
def xywh2xyxy(x):
# Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
return y
def box_iou(box1, box2):
# https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
"""
Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
Arguments:
box1 (Tensor[N, 4])
box2 (Tensor[M, 4])
Returns:
iou (Tensor[N, M]): the NxM matrix containing the pairwise
IoU values for every element in boxes1 and boxes2
"""
def box_area(box):
# box = 4xn
return (box[2] - box[0]) * (box[3] - box[1])
area1 = box_area(box1.T)
area2 = box_area(box2.T)
# inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) -
torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
# iou = inter / (area1 + area2 - inter)
return inter / (area1[:, None] + area2 - inter)
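
# Worked example (sketch, assuming box_iou above is in scope): two 2x2 boxes
# shifted horizontally by 1 overlap on a 1x2 strip, so IoU = 2 / (4 + 4 - 2) = 1/3.
#   a = torch.tensor([[0., 0., 2., 2.]]); b = torch.tensor([[1., 0., 3., 2.]])
#   box_iou(a, b)  ->  tensor([[0.3333]])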
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
# Rescale coords (xyxy) from img1_shape to img0_shape
if ratio_pad is None: # calculate from img0_shape
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding
else:
gain = ratio_pad[0][0]
pad = ratio_pad[1]
coords[:, [0, 2]] -= pad[0] # x padding
coords[:, [1, 3]] -= pad[1] # y padding
coords[:, :4] /= gain
clip_coords(coords, img0_shape)
return coords
def clip_coords(boxes, img_shape):
# Clip bounding xyxy bounding boxes to image shape (height, width)
boxes[:, 0].clamp_(0, img_shape[1]) # x1
boxes[:, 1].clamp_(0, img_shape[0]) # y1
boxes[:, 2].clamp_(0, img_shape[1]) # x2
boxes[:, 3].clamp_(0, img_shape[0]) # y2
def scale_coords_landmarks(img1_shape, coords, img0_shape, ratio_pad=None):
# Rescale coords (xyxy) from img1_shape to img0_shape
if ratio_pad is None: # calculate from img0_shape
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding
else:
gain = ratio_pad[0][0]
pad = ratio_pad[1]
coords[:, [0, 2, 4, 6, 8]] -= pad[0] # x padding
coords[:, [1, 3, 5, 7, 9]] -= pad[1] # y padding
coords[:, :10] /= gain
#clip_coords(coords, img0_shape)
coords[:, 0].clamp_(0, img0_shape[1]) # x1
coords[:, 1].clamp_(0, img0_shape[0]) # y1
coords[:, 2].clamp_(0, img0_shape[1]) # x2
coords[:, 3].clamp_(0, img0_shape[0]) # y2
coords[:, 4].clamp_(0, img0_shape[1]) # x3
coords[:, 5].clamp_(0, img0_shape[0]) # y3
coords[:, 6].clamp_(0, img0_shape[1]) # x4
coords[:, 7].clamp_(0, img0_shape[0]) # y4
coords[:, 8].clamp_(0, img0_shape[1]) # x5
coords[:, 9].clamp_(0, img0_shape[0]) # y5
return coords
def show_results(img, xywh, conf, landmarks, class_num):
h,w,c = img.shape
tl = 1 or round(0.002 * (h + w) / 2) + 1 # line/font thickness
x1 = int(xywh[0] * w - 0.5 * xywh[2] * w)
y1 = int(xywh[1] * h - 0.5 * xywh[3] * h)
x2 = int(xywh[0] * w + 0.5 * xywh[2] * w)
y2 = int(xywh[1] * h + 0.5 * xywh[3] * h)
cv2.rectangle(img, (x1,y1), (x2, y2), (0,255,0), thickness=tl, lineType=cv2.LINE_AA)
clors = [(255,0,0),(0,255,0),(0,0,255),(255,255,0),(0,255,255)]
for i in range(5):
point_x = int(landmarks[2 * i] * w)
point_y = int(landmarks[2 * i + 1] * h)
cv2.circle(img, (point_x, point_y), tl+1, clors[i], -1)
tf = max(tl - 1, 1) # font thickness
label = str(conf)[:5]
cv2.putText(img, label, (x1, y1 - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
return img
def make_divisible(x, divisor):
# Returns x evenly divisible by divisor
return (x // divisor) * divisor
def non_max_suppression_face(prediction, conf_thres=0.5, iou_thres=0.45, classes=None, agnostic=False, labels=()):
"""Performs Non-Maximum Suppression (NMS) on inference results
Returns:
detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
"""
nc = prediction.shape[2] - 15 # number of classes
xc = prediction[..., 4] > conf_thres # candidates
# Settings
min_wh, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height
# time_limit = 10.0 # seconds to quit after
redundant = True # require redundant detections
multi_label = nc > 1 # multiple labels per box (adds 0.5ms/img)
merge = False # use merge-NMS
# t = time.time()
output = [torch.zeros((0, 16), device=prediction.device)] * prediction.shape[0]
for xi, x in enumerate(prediction): # image index, image inference
# Apply constraints
# x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
x = x[xc[xi]] # confidence
# Cat apriori labels if autolabelling
if labels and len(labels[xi]):
l = labels[xi]
v = torch.zeros((len(l), nc + 15), device=x.device)
v[:, :4] = l[:, 1:5] # box
v[:, 4] = 1.0 # conf
v[range(len(l)), l[:, 0].long() + 15] = 1.0 # cls
x = torch.cat((x, v), 0)
# If none remain process next image
if not x.shape[0]:
continue
# Compute conf
x[:, 15:] *= x[:, 4:5] # conf = obj_conf * cls_conf
# Box (center x, center y, width, height) to (x1, y1, x2, y2)
box = xywh2xyxy(x[:, :4])
# Detections matrix nx6 (xyxy, conf, landmarks, cls)
if multi_label:
i, j = (x[:, 15:] > conf_thres).nonzero(as_tuple=False).T
x = torch.cat((box[i], x[i, j + 15, None], x[i, 5:15] ,j[:, None].float()), 1)
else: # best class only
conf, j = x[:, 15:].max(1, keepdim=True)
x = torch.cat((box, conf, x[:, 5:15], j.float()), 1)[conf.view(-1) > conf_thres]
# Filter by class
if classes is not None:
x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
# If none remain process next image
n = x.shape[0] # number of boxes
if not n:
continue
# Batched NMS
c = x[:, 15:16] * (0 if agnostic else max_wh) # classes
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
#if i.shape[0] > max_det: # limit detections
# i = i[:max_det]
if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
# update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
weights = iou * scores[None] # box weights
x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
if redundant:
i = i[iou.sum(1) > 1] # require redundancy
output[xi] = x[i]
# if (time.time() - t) > time_limit:
# break # time limit exceeded
return output
class DetFace():
def __init__(self, pt_path, confThreshold=0.5, nmsThreshold=0.45, device='cuda'):
assert os.path.exists(pt_path)
self.inpSize = 416
self.conf_thres = confThreshold
self.iou_thres = nmsThreshold
self.test_device = torch.device(device if torch.cuda.is_available() else "cpu")
self.model = torch.jit.load(pt_path).to(self.test_device)
self.last_w = 416
self.last_h = 416
self.grids = None
@torch.no_grad()
def detect(self, srcimg):
# t0=time.time()
h0, w0 = srcimg.shape[:2] # orig hw
r = self.inpSize / min(h0, w0) # resize image to img_size
h1 = int(h0*r+31)//32*32
w1 = int(w0*r+31)//32*32
img = cv2.resize(srcimg, (w1,h1), interpolation=cv2.INTER_LINEAR)
# Convert
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # BGR to RGB
# Run inference
img = torch.from_numpy(img).to(self.test_device).permute(2,0,1)
img = img.float()/255 # uint8 to fp16/32 0-1
if img.ndimension() == 3:
img = img.unsqueeze(0)
# Inference
if h1 != self.last_h or w1 != self.last_w or self.grids is None:
grids = []
for scale in [8,16,32]:
ny = h1//scale
nx = w1//scale
yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
grid = torch.stack((xv, yv), 2).view((1,1,ny, nx, 2)).float()
grids.append(grid.to(self.test_device))
self.grids = grids
self.last_w = w1
self.last_h = h1
pred = self.model(img, self.grids).cpu()
# Apply NMS
det = non_max_suppression_face(pred, self.conf_thres, self.iou_thres)[0]
# Process detections
# det = pred[0]
bboxes = np.zeros((det.shape[0], 4))
kpss = np.zeros((det.shape[0], 5, 2))
scores = np.zeros((det.shape[0]))
# gn = torch.tensor([w0, h0, w0, h0]).to(pred) # normalization gain whwh
# gn_lks = torch.tensor([w0, h0, w0, h0, w0, h0, w0, h0, w0, h0]).to(pred) # normalization gain landmarks
det = det.cpu().numpy()
for j in range(det.shape[0]):
# xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(4).cpu().numpy()
bboxes[j, 0] = det[j, 0] * w0/w1
bboxes[j, 1] = det[j, 1] * h0/h1
bboxes[j, 2] = det[j, 2] * w0/w1 - bboxes[j, 0]
bboxes[j, 3] = det[j, 3] * h0/h1 - bboxes[j, 1]
scores[j] = det[j, 4]
# landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(5,2).cpu().numpy()
kpss[j, :, :] = det[j, 5:15].reshape(5, 2) * np.array([[w0/w1,h0/h1]])
# class_num = det[j, 15].cpu().numpy()
# orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
return bboxes, kpss, scores

View File

@ -1 +1,2 @@
from .pipeline_hunyuan_video import HunyuanVideoPipeline
from .pipeline_hunyuan_video_audio import HunyuanVideoAudioPipeline

View File

@ -1142,7 +1142,7 @@ class HunyuanVideoPipeline(DiffusionPipeline):
target_dtype = PRECISION_TO_TYPE[precision]
autocast_enabled = target_dtype != torch.float32 and not disable_autocast
vae_dtype = PRECISION_TO_TYPE[vae_precision]
vae_dtype = self.vae._model_dtype # PRECISION_TO_TYPE[vae_precision]
vae_autocast_enabled = vae_dtype != torch.float32 and not disable_autocast
# 7. Denoising loop
@ -1262,6 +1262,7 @@ class HunyuanVideoPipeline(DiffusionPipeline):
guidance=guidance_expand,
pipeline=self,
x_id=j,
step_no=i,
callback = callback,
)
if self._interrupt:
@ -1290,6 +1291,7 @@ class HunyuanVideoPipeline(DiffusionPipeline):
freqs_sin=freqs_cis[1], # [seqlen, head_dim]
guidance=guidance_expand,
pipeline=self,
step_no=i,
callback = callback,
)
if self._interrupt:
@ -1404,7 +1406,6 @@ class HunyuanVideoPipeline(DiffusionPipeline):
else:
image = latents
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().float()

File diff suppressed because it is too large

View File

@ -5,7 +5,7 @@ import functools
from typing import List, Optional, Tuple, Union
from pathlib import Path
from einops import rearrange
import torch
import torch.distributed as dist
from hyvideo.constants import PROMPT_TEMPLATE, NEGATIVE_PROMPT, PRECISION_TO_TYPE, NEGATIVE_PROMPT_I2V
@ -16,11 +16,34 @@ from hyvideo.utils.data_utils import align_to, get_closest_ratio, generate_crop_
from hyvideo.modules.posemb_layers import get_nd_rotary_pos_embed, get_nd_rotary_pos_embed_new
from hyvideo.diffusion.schedulers import FlowMatchDiscreteScheduler
from hyvideo.diffusion.pipelines import HunyuanVideoPipeline
from hyvideo.diffusion.pipelines import HunyuanVideoAudioPipeline
from PIL import Image
import numpy as np
import torchvision.transforms as transforms
import cv2
from wan.utils.utils import resize_lanczos, calculate_new_dimensions
from hyvideo.data_kits.audio_preprocessor import encode_audio, get_facemask
from transformers import WhisperModel
from transformers import AutoFeatureExtractor
from hyvideo.data_kits.face_align import AlignImage
import librosa
def get_audio_feature(feature_extractor, audio_path, duration):
audio_input, sampling_rate = librosa.load(audio_path, duration=duration, sr=16000)
assert sampling_rate == 16000
audio_features = []
window = 750*640
for i in range(0, len(audio_input), window):
audio_feature = feature_extractor(audio_input[i:i+window],
sampling_rate=sampling_rate,
return_tensors="pt",
device="cuda"
).input_features
audio_features.append(audio_feature)
audio_features = torch.cat(audio_features, dim=-1)
return audio_features, len(audio_input) // 640
def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
crop_h, crop_w = crop_img.shape[:2]
@ -212,6 +235,14 @@ def patched_llava_forward(
image_hidden_states=image_features if pixel_values is not None else None,
)
def adapt_avatar_model(model):
modules_dict= { k: m for k, m in model.named_modules()}
for model_layer, avatar_layer in model.double_stream_map.items():
module = modules_dict[f"audio_adapter_blocks.{avatar_layer}"]
target = modules_dict[f"double_blocks.{model_layer}"]
setattr(target, "audio_adapter", module )
delattr(model, "audio_adapter_blocks")
class DataPreprocess(object):
def __init__(self):
self.llava_size = (336, 336)
@ -223,12 +254,18 @@ class DataPreprocess(object):
]
)
def get_batch(self, image , size):
def get_batch(self, image , size, pad = False):
image = np.asarray(image)
llava_item_image = pad_image(image.copy(), self.llava_size)
if pad:
llava_item_image = pad_image(image.copy(), self.llava_size)
else:
llava_item_image = image.copy()
uncond_llava_item_image = np.ones_like(llava_item_image) * 255
cat_item_image = pad_image(image.copy(), size)
if pad:
cat_item_image = pad_image(image.copy(), size)
else:
cat_item_image = image.copy()
llava_item_tensor = self.llava_transform(Image.fromarray(llava_item_image.astype(np.uint8)))
uncond_llava_item_tensor = self.llava_transform(Image.fromarray(uncond_llava_item_image))
cat_item_tensor = torch.from_numpy(cat_item_image.copy()).permute((2, 0, 1)) / 255.0
@ -243,6 +280,8 @@ class Inference(object):
def __init__(
self,
i2v,
custom,
avatar,
enable_cfg,
vae,
vae_kwargs,
@ -250,9 +289,14 @@ class Inference(object):
model,
text_encoder_2=None,
pipeline=None,
feature_extractor=None,
wav2vec=None,
align_instance=None,
device=None,
):
self.i2v = i2v
self.custom = custom
self.avatar = avatar
self.enable_cfg = enable_cfg
self.vae = vae
self.vae_kwargs = vae_kwargs
@ -263,8 +307,11 @@ class Inference(object):
self.model = model
self.pipeline = pipeline
self.device = "cuda"
self.feature_extractor=feature_extractor
self.wav2vec=wav2vec
self.align_instance=align_instance
self.device = "cuda"
@classmethod
@ -285,15 +332,21 @@ class Inference(object):
i2v_condition_type = None
i2v_mode = "i2v" in model_filepath[0]
custom = False
avatar = False
if i2v_mode:
model_id = "HYVideo-T/2"
i2v_condition_type = "token_replace"
elif "custom" in model_filepath[0]:
model_id = "HYVideo-T/2-custom"
custom = True
elif "avatar" in model_filepath[0]:
model_id = "HYVideo-T/2-avatar"
text_len = 256
avatar = True
else:
model_id = "HYVideo-T/2-cfgdistill"
if i2v_mode and i2v_condition_type == "latent_concat":
in_channels = latent_channels * 2 + 1
image_embed_interleave = 2
@ -323,11 +376,11 @@ class Inference(object):
from mmgp import offload
# model = Inference.load_state_dict(args, model, model_filepath)
# model_filepath ="c:/temp/hc/mp_rank_00_model_states.pt"
# model_filepath ="c:/temp/avatar/mp_rank_00_model_states.pt"
offload.load_model_data(model, model_filepath, pinToMemory = pinToMemory, partialPinning = partialPinning)
pass
# offload.save_model(model, "hunyuan_video_custom_720_bf16.safetensors")
# offload.save_model(model, "hunyuan_video_custom_720_quanto_bf16_int8.safetensors", do_quantize= True)
# offload.save_model(model, "hunyuan_video_avatar_720_bf16.safetensors")
# offload.save_model(model, "hunyuan_video_avatar_720_quanto_bf16_int8.safetensors", do_quantize= True)
model.mixed_precision = mixed_precision_transformer
@ -338,9 +391,12 @@ class Inference(object):
# ============================= Build extra models ========================
# VAE
if custom:
if custom or avatar:
vae_configpath = "ckpts/hunyuan_video_custom_VAE_config.json"
vae_filepath = "ckpts/hunyuan_video_custom_VAE_fp32.safetensors"
# elif avatar:
# vae_configpath = "ckpts/config_vae_avatar.json"
# vae_filepath = "ckpts/vae_avatar.pt"
else:
vae_configpath = "ckpts/hunyuan_video_VAE_config.json"
vae_filepath = "ckpts/hunyuan_video_VAE_fp32.safetensors"
@ -350,6 +406,7 @@ class Inference(object):
vae, _, s_ratio, t_ratio = load_vae( "884-16c-hy", vae_path= vae_filepath, vae_config_path= vae_configpath, vae_precision= vae_precision, device= "cpu", )
vae._model_dtype = torch.float32 if VAE_dtype == torch.float32 else (torch.float16 if avatar else torch.bfloat16)
vae._model_dtype = torch.float32 if VAE_dtype == torch.float32 else torch.bfloat16
vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}
enable_cfg = False
@ -359,7 +416,7 @@ class Inference(object):
tokenizer = "llm-i2v"
prompt_template = "dit-llm-encode-i2v"
prompt_template_video = "dit-llm-encode-video-i2v"
elif custom :
elif custom or avatar :
text_encoder = "llm-i2v"
tokenizer = "llm-i2v"
prompt_template = "dit-llm-encode"
@ -411,14 +468,33 @@ class Inference(object):
device="cpu",
)
feature_extractor = None
wav2vec = None
align_instance = None
if avatar:
feature_extractor = AutoFeatureExtractor.from_pretrained("ckpts/whisper-tiny/")
wav2vec = WhisperModel.from_pretrained("ckpts/whisper-tiny/").to(device="cpu", dtype=torch.float32)
wav2vec._model_dtype = torch.float32
wav2vec.requires_grad_(False)
align_instance = AlignImage("cuda", det_path="ckpts/det_align/detface.pt")
align_instance.facedet.model.to("cpu")
adapt_avatar_model(model)
return cls(
i2v=i2v_mode,
custom=custom,
avatar=avatar,
enable_cfg = enable_cfg,
vae=vae,
vae_kwargs=vae_kwargs,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
model=model,
feature_extractor=feature_extractor,
wav2vec=wav2vec,
align_instance=align_instance,
device=device,
)
@ -428,6 +504,8 @@ class HunyuanVideoSampler(Inference):
def __init__(
self,
i2v,
custom,
avatar,
enable_cfg,
vae,
vae_kwargs,
@ -435,10 +513,15 @@ class HunyuanVideoSampler(Inference):
model,
text_encoder_2=None,
pipeline=None,
feature_extractor=None,
wav2vec=None,
align_instance=None,
device=0,
):
super().__init__(
i2v,
custom,
avatar,
enable_cfg,
vae,
vae_kwargs,
@ -446,12 +529,16 @@ class HunyuanVideoSampler(Inference):
model,
text_encoder_2=text_encoder_2,
pipeline=pipeline,
feature_extractor=feature_extractor,
wav2vec=wav2vec,
align_instance=align_instance,
device=device,
)
self.i2v_mode = i2v
self.enable_cfg = enable_cfg
self.pipeline = self.load_diffusion_pipeline(
avatar = self.avatar,
vae=self.vae,
text_encoder=self.text_encoder,
text_encoder_2=self.text_encoder_2,
@ -474,6 +561,7 @@ class HunyuanVideoSampler(Inference):
def load_diffusion_pipeline(
self,
avatar,
vae,
text_encoder,
text_encoder_2,
@ -491,14 +579,24 @@ class HunyuanVideoSampler(Inference):
solver="euler",
)
pipeline = HunyuanVideoPipeline(
vae=vae,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
transformer=model,
scheduler=scheduler,
progress_bar_config=progress_bar_config,
)
if avatar:
pipeline = HunyuanVideoAudioPipeline(
vae=vae,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
transformer=model,
scheduler=scheduler,
progress_bar_config=progress_bar_config,
)
else:
pipeline = HunyuanVideoPipeline(
vae=vae,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
transformer=model,
scheduler=scheduler,
progress_bar_config=progress_bar_config,
)
return pipeline
@ -588,6 +686,8 @@ class HunyuanVideoSampler(Inference):
self,
input_prompt,
input_ref_images = None,
audio_guide = None,
fps = 24,
height=192,
width=336,
frame_num=129,
@ -617,14 +717,12 @@ class HunyuanVideoSampler(Inference):
self.vae.tile_sample_min_size = VAE_tile_size["tile_sample_min_size"]
self.vae.tile_latent_min_size = VAE_tile_size["tile_latent_min_size"]
self.vae.tile_overlap_factor = VAE_tile_size["tile_overlap_factor"]
self.vae.enable_tiling()
i2v_mode= self.i2v_mode
if not self.enable_cfg:
guide_scale=1.0
out_dict = dict()
# ========================================================================
# Arguments: seed
# ========================================================================
@ -663,7 +761,6 @@ class HunyuanVideoSampler(Inference):
seed_everything(seed)
generator = [torch.Generator("cuda").manual_seed(seed) for seed in seeds]
# generator = [torch.Generator(self.device).manual_seed(seed) for seed in seeds]
out_dict["seeds"] = seeds
# ========================================================================
# Arguments: target_width, target_height, target_frame_num
@ -681,8 +778,6 @@ class HunyuanVideoSampler(Inference):
target_width = align_to(width, 16)
target_frame_num = frame_num
out_dict["size"] = (target_height, target_width, target_frame_num)
if input_ref_images != None:
# ip_cfg_scale = 3.0
ip_cfg_scale = 0
@ -769,28 +864,91 @@ class HunyuanVideoSampler(Inference):
if input_ref_images == None:
freqs_cos, freqs_sin = self.get_rotary_pos_embed(target_frame_num, target_height, target_width, enable_riflex)
else:
concat_dict = {'mode': 'timecat-w', 'bias': -1}
freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(target_frame_num, target_height, target_width, concat_dict)
if self.avatar:
w, h = input_ref_images.size
target_height, target_width = calculate_new_dimensions(target_height, target_width, h, w, fit_into_canvas)
concat_dict = {'mode': 'timecat', 'bias': -1}
freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(129, target_height, target_width, concat_dict)
else:
concat_dict = {'mode': 'timecat-w', 'bias': -1}
freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(target_frame_num, target_height, target_width, concat_dict)
n_tokens = freqs_cos.shape[0]
callback = kwargs.pop("callback", None)
callback_steps = kwargs.pop("callback_steps", None)
# ========================================================================
# Pipeline inference
# ========================================================================
start_time = time.time()
# "pixel_value_llava": llava_item_tensor.unsqueeze(0),
# "uncond_pixel_value_llava": uncond_llava_item_tensor.unsqueeze(0),
# 'pixel_value_ref': cat_item_tensor.unsqueeze(0),
pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = None, None, None
if input_ref_images == None:
pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = None, None, None
name = None
else:
pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = DataPreprocess().get_batch(input_ref_images, (target_width, target_height))
pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = DataPreprocess().get_batch(input_ref_images, (target_width, target_height), pad = self.custom)
ref_latents, uncond_audio_prompts, audio_prompts, face_masks, motion_exp, motion_pose = None, None, None, None, None, None
if audio_guide != None:
if n_prompt == None or len(n_prompt) == 0:
n_prompt = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, Lens changes"
uncond_pixel_value_llava = pixel_value_llava.clone()
pixel_value_ref = pixel_value_ref.unsqueeze(0)
self.align_instance.facedet.model.to("cuda")
face_masks = get_facemask(pixel_value_ref.to("cuda")*255, self.align_instance, area=3.0)
# iii = (face_masks.squeeze(0).squeeze(0).permute(1,2,0).repeat(1,1,3)*255).cpu().numpy().astype(np.uint8)
# image = Image.fromarray(iii)
# image.save("mask.png")
# jjj = (pixel_value_ref.squeeze(0).squeeze(0).permute(1,2,0)*255).cpu().numpy().astype(np.uint8)
self.align_instance.facedet.model.to("cpu")
# pixel_value_ref = pixel_value_ref.clone().repeat(1,129,1,1,1)
pixel_value_ref = pixel_value_ref.repeat(1,1+4*2,1,1,1)
pixel_value_ref = pixel_value_ref * 2 - 1
pixel_value_ref_for_vae = rearrange(pixel_value_ref, "b f c h w -> b c f h w")
vae_dtype = self.vae.dtype
with torch.autocast(device_type="cuda", dtype=vae_dtype, enabled=vae_dtype != torch.float32):
ref_latents = self.vae.encode(pixel_value_ref_for_vae).latent_dist.sample()
ref_latents = torch.cat( [ref_latents[:,:, :1], ref_latents[:,:, 1:2].repeat(1,1,31,1,1), ref_latents[:,:, -1:]], dim=2)
pixel_value_ref, pixel_value_ref_for_vae = None, None
if hasattr(self.vae.config, 'shift_factor') and self.vae.config.shift_factor:
ref_latents.sub_(self.vae.config.shift_factor).mul_(self.vae.config.scaling_factor)
else:
ref_latents.mul_(self.vae.config.scaling_factor)
# out_latents= ref_latents / self.vae.config.scaling_factor
# image = self.vae.decode(out_latents, return_dict=False, generator=generator)[0]
# image = image.clamp(-1, 1)
# from wan.utils.utils import cache_video
# cache_video( tensor=image, save_file="decode.mp4", fps=25, nrow=1, normalize=True, value_range=(-1, 1))
face_masks = torch.nn.functional.interpolate(face_masks.float().squeeze(2),
(ref_latents.shape[-2],
ref_latents.shape[-1]),
mode="bilinear").unsqueeze(2).to(dtype=ref_latents.dtype)
audio_input, audio_len = get_audio_feature(self.feature_extractor, audio_guide, duration = frame_num/fps )
audio_prompts = audio_input[0]
weight_dtype = audio_prompts.dtype
motion_pose = np.array([25] * 4)
motion_exp = np.array([30] * 4)
motion_pose = torch.from_numpy(motion_pose).unsqueeze(0)
motion_exp = torch.from_numpy(motion_exp).unsqueeze(0)
audio_prompts = encode_audio(self.wav2vec, audio_prompts.to(dtype=self.wav2vec.dtype), fps, num_frames=audio_len)
audio_prompts = audio_prompts.to(self.model.dtype)
if audio_prompts.shape[1] <= 129:
audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1,129-audio_prompts.shape[1], 1, 1, 1)], dim=1)
else:
audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1, 5, 1, 1, 1)], dim=1)
uncond_audio_prompts = torch.zeros_like(audio_prompts[:,:129])
# target_frame_num = min(target_frame_num, audio_len)
samples = self.pipeline(
prompt=input_prompt,
height=target_height,
@ -803,9 +961,18 @@ class HunyuanVideoSampler(Inference):
generator=generator,
output_type="pil",
name = name,
pixel_value_llava = pixel_value_llava,
uncond_pixel_value_llava=uncond_pixel_value_llava,
pixel_value_ref=pixel_value_ref,
pixel_value_ref = pixel_value_ref,
ref_latents=ref_latents, # [1, 16, 1, h//8, w//8]
pixel_value_llava=pixel_value_llava, # [1, 3, 336, 336]
uncond_pixel_value_llava=uncond_pixel_value_llava,
face_masks=face_masks, # [b f h w]
audio_prompts=audio_prompts,
uncond_audio_prompts=uncond_audio_prompts,
motion_exp=motion_exp,
motion_pose=motion_pose,
fps= torch.from_numpy(np.array(fps)),
denoise_strength=denoise_strength,
ip_cfg_scale=ip_cfg_scale,
freqs_cis=(freqs_cos, freqs_sin),
@ -825,9 +992,9 @@ class HunyuanVideoSampler(Inference):
callback = callback,
callback_steps = callback_steps,
)[0]
gen_time = time.time() - start_time
if samples == None:
return None
samples = samples.sub_(0.5).mul_(2).squeeze(0)
samples = samples.squeeze(0)
return samples
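
The avatar branch above builds ref_latents from a single reference image: the frame is repeated 1 + 4*2 = 9 times before VAE encoding, and the middle latent frame is then tiled 31 times so that [first, middle*31, last] matches the latent length of a 129-frame clip. A small sketch of that bookkeeping, assuming the usual causal 4x temporal compression of the "884" Hunyuan VAE (1 + (F - 1) // 4 latent frames for F input frames):

def latent_frames(frames: int) -> int:
    # assumed formula for the causal "884" VAE: 4x temporal compression, first frame kept whole
    return 1 + (frames - 1) // 4

ref_frames = 1 + 4 * 2                 # reference image repeated 9 times
print(latent_frames(ref_frames))       # 3 latent frames: [first, middle, last]
print(latent_frames(129))              # 33, matched by 1 + 31 + 1 after tiling the middle frame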

View File

@ -0,0 +1,220 @@
"""
This module provides the implementation of an Audio Projection Model, which is designed for
audio processing tasks. The model takes audio embeddings as input and outputs context tokens
that can be used for various downstream applications, such as audio analysis or synthesis.
The AudioProjModel class is based on the ModelMixin class from the diffusers library, which
provides a foundation for building custom models. This implementation includes multiple linear
layers with ReLU activation functions and a LayerNorm for normalization.
Key Features:
- Audio embedding input with flexible sequence length and block structure.
- Multiple linear layers for feature transformation.
- ReLU activation for non-linear transformation.
- LayerNorm for stabilizing and speeding up training.
- Rearrangement of input embeddings to match the model's expected input shape.
- Customizable number of blocks, channels, and context tokens for adaptability.
The module is structured to be easily integrated into larger systems or used as a standalone
component for audio feature extraction and processing.
Classes:
- AudioProjModel: A class representing the audio projection model with configurable parameters.
Functions:
- (none)
Dependencies:
- torch: For tensor operations and neural network components.
- diffusers: For the ModelMixin base class.
- einops: For tensor rearrangement operations.
"""
import torch
from diffusers import ModelMixin
from einops import rearrange
import math
import torch.nn as nn
class AudioProjNet2(ModelMixin):
"""Audio Projection Model
This class defines an audio projection model that takes audio embeddings as input
and produces context tokens as output. The model is based on the ModelMixin class
and consists of multiple linear layers and activation functions. It can be used
for various audio processing tasks.
Attributes:
seq_len (int): The length of the audio sequence.
blocks (int): The number of blocks in the audio projection model.
channels (int): The number of channels in the audio projection model.
intermediate_dim (int): The intermediate dimension of the model.
context_tokens (int): The number of context tokens in the output.
output_dim (int): The output dimension of the context tokens.
Methods:
__init__(self, seq_len=5, blocks=12, channels=768, intermediate_dim=512, context_tokens=32, output_dim=768):
Initializes the AudioProjModel with the given parameters.
forward(self, audio_embeds):
Defines the forward pass for the AudioProjModel.
Parameters:
audio_embeds (torch.Tensor): The input audio embeddings with shape (batch_size, video_length, blocks, channels).
Returns:
context_tokens (torch.Tensor): The output context tokens with shape (batch_size, video_length, context_tokens, output_dim).
"""
def __init__(
self,
seq_len=5,
blocks=12, # add a new parameter blocks
channels=768, # add a new parameter channels
intermediate_dim=512,
output_dim=768,
context_tokens=4,
):
super().__init__()
self.seq_len = seq_len
self.blocks = blocks
self.channels = channels
self.input_dim = (
seq_len * blocks * channels
)
self.intermediate_dim = intermediate_dim
self.context_tokens = context_tokens
self.output_dim = output_dim
# define multiple linear layers
self.proj1 = nn.Linear(self.input_dim, intermediate_dim)
self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)
self.norm = nn.LayerNorm(output_dim)
def forward(self, audio_embeds):
video_length = audio_embeds.shape[1]
audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
batch_size, window_size, blocks, channels = audio_embeds.shape
audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels)
audio_embeds = torch.relu(self.proj1(audio_embeds))
audio_embeds = torch.relu(self.proj2(audio_embeds))
context_tokens = self.proj3(audio_embeds).reshape(
batch_size, self.context_tokens, self.output_dim
)
context_tokens = self.norm(context_tokens)
out_all = rearrange(
context_tokens, "(bz f) m c -> bz f m c", f=video_length
)
return out_all
def reshape_tensor(x, heads):
bs, length, width = x.shape
# (bs, length, width) --> (bs, length, n_heads, dim_per_head)
x = x.view(bs, length, heads, -1)
# (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
x = x.transpose(1, 2)
# (bs, n_heads, length, dim_per_head)
x = x.reshape(bs, heads, length, -1)
return x
class PerceiverAttentionCA(nn.Module):
def __init__(self, *, dim=3072, dim_head=1024, heads=33):
super().__init__()
self.scale = dim_head ** -0.5
self.dim_head = dim_head
self.heads = heads
inner_dim = dim_head #* heads
self.norm1 = nn.LayerNorm(dim)
self.norm2 = nn.LayerNorm(dim)
self.to_q = nn.Linear(dim, inner_dim, bias=False)
self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
self.to_out = nn.Linear(inner_dim, dim, bias=False)
import torch.nn.init as init
init.zeros_(self.to_out.weight)
if self.to_out.bias is not None:
init.zeros_(self.to_out.bias)
def forward(self, x, latents):
"""
Args:
x (torch.Tensor): image features
shape (b, t, aa, D)
latent (torch.Tensor): latent features
shape (b, t, hw, D)
"""
x = self.norm1(x)
latents = self.norm2(latents)
# print("latents shape: ", latents.shape)
# print("x shape: ", x.shape)
q = self.to_q(latents)
k, v = self.to_kv(x).chunk(2, dim=-1)
# attention
scale = 1 / math.sqrt(math.sqrt(self.dim_head))
weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
out = weight @ v
# out = out.permute(0, 2, 1, 3)
return self.to_out(out)
#def forward(self, x, latents):
# """
# Args:
# x (torch.Tensor): image features
# shape (b, t, aa, D)
# latent (torch.Tensor): latent features
# shape (b, t, hw, D)
# """
# if get_sequence_parallel_state():
# sp_size = nccl_info.sp_size
# sp_rank = nccl_info.rank_within_group
# print("rank:", latents.shape, sp_size, sp_rank)
# latents = torch.chunk(latents, sp_size, dim=1)[sp_rank]
# x = self.norm1(x)
# latents = self.norm2(latents)
# # print("latents shape: ", latents.shape)
# # print("x shape: ", x.shape)
# q = self.to_q(latents)
# k, v = self.to_kv(x).chunk(2, dim=-1)
# # print("q, k, v: ", q.shape, k.shape, v.shape)
# # attention
# #scale = 1 / math.sqrt(math.sqrt(self.dim_head))
# #weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
# #weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
# #out = weight @ v
# def shrink_head(encoder_state, dim):
# local_heads = encoder_state.shape[dim] // nccl_info.sp_size
# return encoder_state.narrow(dim, nccl_info.rank_within_group * local_heads, local_heads)
# if get_sequence_parallel_state():
# # batch_size, seq_len, attn_heads, head_dim
# q = all_to_all_4D(q, scatter_dim=2, gather_dim=1) # [2, 32256, 24, 128]
# k = shrink_head(k ,dim=2)
# v = shrink_head(v ,dim=2)
# qkv = torch.stack([query, key, value], dim=2)
# attn = flash_attn_no_pad(qkv, causal=False, dropout_p=0.0, softmax_scale=None)
# # out = out.permute(0, 2, 1, 3)
# #b, s, a, d = attn.shape
# #attn = attn.reshape(b, s, -1)
#
# out = self.to_out(attn)
# if get_sequence_parallel_state():
# out = all_gather(out, dim=1)
# return out
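
AudioProjNet2 collapses each frame's 10 x 5 x 384 Whisper window into 4 context tokens of width 3072, and PerceiverAttentionCA lets the per-frame image tokens attend to those tokens (in models.py the tokens of 4 consecutive frames are regrouped to 16 per latent frame before this call). A shape sketch, assuming the two classes above are in scope; the dummy 240 tokens per frame are an arbitrary placeholder:

import torch

proj = AudioProjNet2(seq_len=10, blocks=5, channels=384,
                     intermediate_dim=1024, output_dim=3072, context_tokens=4)
ca = PerceiverAttentionCA(dim=3072, dim_head=1024, heads=33)

audio = torch.randn(1, 33, 10, 5, 384)        # (b, latent frames, window, blocks, channels)
tokens = proj(audio)                          # (1, 33, 4, 3072) audio context tokens
img_tokens = torch.randn(1, 33, 240, 3072)    # dummy per-frame image tokens

out = ca(tokens, img_tokens)                  # image tokens query the audio tokens
print(tokens.shape, out.shape)                # torch.Size([1, 33, 4, 3072]) torch.Size([1, 33, 240, 3072])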

View File

@ -53,10 +53,11 @@ class PatchEmbed(nn.Module):
def forward(self, x):
x = self.proj(x)
shape = x.shape
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
x = self.norm(x)
return x
return x, shape
class TextProjection(nn.Module):

View File

@ -19,6 +19,7 @@ from .token_refiner import SingleTokenRefiner
import numpy as np
from mmgp import offload
from wan.modules.attention import pay_attention
from .audio_adapters import AudioProjNet2, PerceiverAttentionCA
def get_linear_split_map():
hidden_size = 3072
@ -589,7 +590,8 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
use_attention_mask: bool = True,
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
attention_mode: Optional[str] = "sdpa"
attention_mode: Optional[str] = "sdpa",
avatar = False,
):
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
@ -708,6 +710,45 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
get_activation_layer("silu"),
**factory_kwargs,
)
avatar_audio = avatar
if avatar_audio:
self.ref_in = PatchEmbed(
self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
)
# -------------------- audio_proj_model --------------------
self.audio_proj = AudioProjNet2(seq_len=10, blocks=5, channels=384, intermediate_dim=1024, output_dim=3072, context_tokens=4)
# -------------------- motion-embeder --------------------
self.motion_exp = TimestepEmbedder(
self.hidden_size // 4,
get_activation_layer("silu"),
**factory_kwargs
)
self.motion_pose = TimestepEmbedder(
self.hidden_size // 4,
get_activation_layer("silu"),
**factory_kwargs
)
self.fps_proj = TimestepEmbedder(
self.hidden_size,
get_activation_layer("silu"),
**factory_kwargs
)
self.before_proj = nn.Linear(self.hidden_size, self.hidden_size)
# -------------------- audio_insert_model --------------------
self.double_stream_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
self.single_stream_list = []
self.double_stream_map = {str(i): j for j, i in enumerate(self.double_stream_list)}
self.single_stream_map = {str(i): j+len(self.double_stream_list) for j, i in enumerate(self.single_stream_list)}
self.audio_adapter_blocks = nn.ModuleList([
PerceiverAttentionCA(dim=3072, dim_head=1024, heads=33) for _ in range(len(self.double_stream_list) + len(self.single_stream_list))
])
def lock_layers_dtypes(self, dtype = torch.float32):
@ -750,11 +791,17 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
guidance: torch.Tensor = None, # Guidance for modulation, should be cfg_scale x 1000.
pipeline=None,
x_id = 0,
step_no = 0,
callback = None,
audio_prompts = None,
motion_exp = None,
motion_pose = None,
fps = None,
face_mask = None,
) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
img = x
batch_no, _, ot, oh, ow = x.shape
bsz, _, ot, oh, ow = x.shape
del x
txt = text_states
tt, th, tw = (
@ -765,6 +812,17 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
# Prepare modulation vectors.
vec = self.time_in(t)
if motion_exp != None:
vec += self.motion_exp(motion_exp.view(-1)).view(bsz, -1) # (b, 3072)
if motion_pose != None:
vec += self.motion_pose(motion_pose.view(-1)).view(bsz, -1) # (b, 3072)
if fps != None:
vec += self.fps_proj(fps) # (b, 3072)
if audio_prompts != None:
audio_feature_all = self.audio_proj(audio_prompts)
audio_feature_pad = audio_feature_all[:,:1].repeat(1,3,1,1)
audio_feature_all_insert = torch.cat([audio_feature_pad, audio_feature_all], dim=1).view(bsz, ot, 16, 3072)
audio_feature_all = None
if self.i2v_condition_type == "token_replace":
token_replace_t = torch.zeros_like(t)
@ -777,7 +835,6 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
# token_replace_mask_txt = None
# text modulation
# vec = vec + self.vector_in(text_states_2)
vec_2 = self.vector_in(text_states_2)
del text_states_2
vec += vec_2
@ -793,12 +850,17 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
)
# our timestep_embedding is merged into guidance_in(TimestepEmbedder)
vec = vec + self.guidance_in(guidance)
vec += self.guidance_in(guidance)
# Embed image and text.
img = self.img_in(img)
if ref_latents != None:
ref_latents = self.img_in(ref_latents)
img, shape_mask = self.img_in(img)
if audio_prompts != None:
ref_latents_first = ref_latents[:, :, :1].clone()
ref_latents,_ = self.ref_in(ref_latents)
ref_latents_first,_ = self.img_in(ref_latents_first)
elif ref_latents != None:
ref_latents, _ = self.img_in(ref_latents)
if self.text_projection == "linear":
txt = self.txt_in(txt)
elif self.text_projection == "single_refiner":
@ -808,7 +870,18 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
f"Unsupported text_projection: {self.text_projection}"
)
if ref_latents == None:
if audio_prompts != None:
img += self.before_proj(ref_latents)
ref_length = ref_latents_first.shape[-2] # [b s c]
img = torch.cat([ref_latents_first, img], dim=-2) # t c
img_len = img.shape[1]
mask_len = img_len - ref_length
if face_mask.shape[2] == 1:
face_mask = face_mask.repeat(1,1,ot,1,1) # repeat if number of mask frame is 1
face_mask = torch.nn.functional.interpolate(face_mask, size=[ot, shape_mask[-2], shape_mask[-1]], mode="nearest")
# face_mask = face_mask.view(-1,mask_len,1).repeat(1,1,img.shape[-1]).type_as(img)
face_mask = face_mask.view(-1,mask_len,1).type_as(img)
elif ref_latents == None:
ref_length = None
else:
ref_length = ref_latents.shape[-2]
@ -828,13 +901,13 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
if x_id == 0:
self.should_calc = True
inp = img[0:1]
vec_ = vec
vec_ = vec[0:1]
( img_mod1_shift, img_mod1_scale, _ , _ , _ , _ , ) = self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1)
normed_inp = self.double_blocks[0].img_norm1(inp)
normed_inp = normed_inp.to(torch.bfloat16)
modulated_inp = modulate( normed_inp, shift=img_mod1_shift, scale=img_mod1_scale )
del normed_inp, img_mod1_shift, img_mod1_scale
if self.teacache_counter <= self.teacache_start_step or self.teacache_counter == self.num_steps-1:
if step_no <= self.teacache_start_step or step_no == self.num_steps-1:
self.accumulated_rel_l1_distance = 0
else:
coefficients = [7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
@ -846,9 +919,6 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
else:
self.accumulated_rel_l1_distance = 0
self.previous_modulated_input = modulated_inp
self.teacache_counter += 1
if self.teacache_counter == self.num_steps:
self.teacache_counter = 0
else:
self.should_calc = True
@ -859,7 +929,7 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
self.previous_residual[x_id] = None
ori_img = img[0:1].clone()
# --------------------- Pass through DiT blocks ------------------------
for _, block in enumerate(self.double_blocks):
for layer_num, block in enumerate(self.double_blocks):
for i in range(len(img)):
if callback != None:
callback(-1, None, False, True)
@ -880,6 +950,16 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
img[i], txt[i] = block(*double_block_args)
double_block_args = None
# insert audio feature to img
if audio_prompts != None:
audio_adapter = getattr(self.double_blocks[layer_num], "audio_adapter", None)
if audio_adapter != None:
real_img = img[i:i+1,ref_length:].view(1, ot, -1, 3072)
real_img = audio_adapter(audio_feature_all_insert[i:i+1], real_img).view(1, -1, 3072)
real_img *= face_mask[i:i+1]
img[i:i+1, ref_length:] += real_img
real_img = None
for _, block in enumerate(self.single_blocks):
for i in range(len(img)):
@ -932,6 +1012,7 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
img = torch.cat(img_list)
img_list = None
# img = self.unpatchify(img, tt, th, tw)
img = self.unpatchify(img, tt, th, tw)
return img
@ -1015,5 +1096,14 @@ HUNYUAN_VIDEO_CONFIG = {
"heads_num": 24,
"mlp_width_ratio": 4,
},
'HYVideo-T/2-avatar': { # 9.0B / 12.5B
'mm_double_blocks_depth': 20,
'mm_single_blocks_depth': 40,
'rope_dim_list': [16, 56, 56],
'hidden_size': 3072,
'heads_num': 24,
'mlp_width_ratio': 4,
'avatar': True,
},
}
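
The HYVideo-T/2-avatar config enables the audio path above, and adapt_avatar_model in hunyuan.py then re-homes each entry of audio_adapter_blocks onto its double block via double_stream_map, so the forward loop only needs a getattr per layer. A toy reconstruction of that routing (ToyModel and the Identity blocks are placeholders, not the real transformer):

import torch.nn as nn

class ToyModel(nn.Module):                                   # placeholder for HYVideoDiffusionTransformer
    def __init__(self):
        super().__init__()
        self.double_stream_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
        self.double_stream_map = {str(i): j for j, i in enumerate(self.double_stream_list)}
        self.double_blocks = nn.ModuleList([nn.Identity() for _ in range(20)])
        self.audio_adapter_blocks = nn.ModuleList([nn.Identity() for _ in self.double_stream_list])

model = ToyModel()
modules_dict = {k: m for k, m in model.named_modules()}
for model_layer, avatar_layer in model.double_stream_map.items():
    setattr(modules_dict[f"double_blocks.{model_layer}"], "audio_adapter",
            modules_dict[f"audio_adapter_blocks.{avatar_layer}"])
delattr(model, "audio_adapter_blocks")

print(getattr(model.double_blocks[7], "audio_adapter", None) is not None)   # True: mapped layers carry an adapter
print(getattr(model.double_blocks[6], "audio_adapter", None))               # None: unmapped layers skip the audio cross-attention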

wgp.py
View File

@ -190,15 +190,19 @@ def process_prompt_and_add_tasks(state, model_choice):
if video_length > sliding_window_size:
gr.Info(f"The Number of Frames to generate ({video_length}) is greater than the Sliding Window Size ({sliding_window_size}) , multiple Windows will be generated")
if "phantom" in model_filename or "hunyuan_video_custom" in model_filename:
if "phantom" in model_filename or "hunyuan_video_custom" in model_filename or "hunyuan_video_avatar" in model_filename:
image_refs = inputs["image_refs"]
audio_guide = inputs["audio_guide"]
if image_refs == None :
gr.Info("You must provide an Image Reference")
return
if len(image_refs) > 1 and "hunyuan_video_custom" in model_filename:
gr.Info("Only one Image Reference (a person) is supported for the moment by Hunyuan Custom")
if len(image_refs) > 1 and ("hunyuan_video_custom" in model_filename or "hunyuan_video_avatar" in model_filename):
gr.Info("Only one Image Reference (a person) is supported for the moment by Hunyuan Custom / Avatar")
return
if audio_guide == None and "hunyuan_video_avatar" in model_filename:
gr.Info("You must provide an audio file")
return
if any(isinstance(image[0], str) for image in image_refs) :
gr.Info("Reference Image should be an Image")
return
@ -1539,7 +1543,9 @@ wan_choices_i2v=["ckpts/wan2.1_image2video_480p_14B_mbf16.safetensors", "ckpts/w
ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_lora128_bf16.safetensors"]
hunyuan_choices= ["ckpts/hunyuan_video_720_bf16.safetensors", "ckpts/hunyuan_video_720_quanto_int8.safetensors", "ckpts/hunyuan_video_i2v_720_bf16v2.safetensors", "ckpts/hunyuan_video_i2v_720_quanto_int8v2.safetensors",
"ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors" ]
"ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors",
"ckpts/hunyuan_video_avatar_720_bf16.safetensors", "ckpts/hunyuan_video_avatar_720_quanto_bf16_int8.safetensors",
]
transformer_choices = wan_choices_t2v + wan_choices_i2v + ltxv_choices + hunyuan_choices
def get_dependent_models(model_filename, quantization, dtype_policy ):
@ -1549,12 +1555,13 @@ def get_dependent_models(model_filename, quantization, dtype_policy ):
return [get_model_filename("ltxv_13B", quantization, dtype_policy)]
else:
return []
model_types = [ "t2v_1.3B", "t2v", "i2v", "i2v_720p", "flf2v_720p", "vace_1.3B","vace_14B","moviigen", "phantom_1.3B", "phantom_14B", "fantasy", "fun_inp_1.3B", "fun_inp", "recam_1.3B", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
model_types = [ "t2v_1.3B", "t2v", "i2v", "i2v_720p", "flf2v_720p", "vace_1.3B","vace_14B","moviigen", "phantom_1.3B", "phantom_14B", "fantasy", "fun_inp_1.3B", "fun_inp", "recam_1.3B", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom", "hunyuan_avatar"]
model_signatures = {"t2v": "text2video_14B", "t2v_1.3B" : "text2video_1.3B", "fun_inp_1.3B" : "Fun_InP_1.3B", "fun_inp" : "Fun_InP_14B",
"i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "vace_14B" : "Vace_14B","recam_1.3B": "recammaster_1.3B",
"flf2v_720p" : "FLF2V_720p", "sky_df_1.3B" : "sky_reels2_diffusion_forcing_1.3B", "sky_df_14B" : "sky_reels2_diffusion_forcing_14B",
"sky_df_720p_14B" : "sky_reels2_diffusion_forcing_720p_14B", "moviigen" :"moviigen",
"phantom_1.3B" : "phantom_1.3B", "phantom_14B" : "phantom_14B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" }
"phantom_1.3B" : "phantom_1.3B", "phantom_14B" : "phantom_14B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled",
"hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom", "hunyuan_avatar" : "hunyuan_video_avatar" }
def get_model_type(model_filename):
@ -1639,7 +1646,10 @@ def get_model_name(model_filename, description_container = [""]):
description = "A good looking image 2 video model, but not so good in prompt adherence."
elif "hunyuan_video_custom" in model_filename:
model_name = "Hunyuan Video Custom 720p 13B"
description = "The Hunyuan Video Custom model is proably the best model to transfer people (only people for the momment) as it is quite good to keep their identity. However it is slow as to get good results, you need to generate 720p videos with 30 steps."
description = "The Hunyuan Video Custom model is probably the best model to transfer people (only people for the momment) as it is quite good to keep their identity. However it is slow as to get good results, you need to generate 720p videos with 30 steps."
elif "hunyuan_video_avatar" in model_filename:
model_name = "Hunyuan Video Avatar 720p 13B"
description = "With the Hunyuan Video Avatar model you can animate a person based on the content of an audio input. Please note that the video generator works by processing 128 frames segment at a time (even if you ask less). The good news is that it will concatenate multiple segments for long video generation (max 3 segments recommended as the quality will get worse)."
else:
model_name = "Wan2.1 text2video"
model_name += " 14B" if "14B" in model_filename else " 1.3B"
@ -1758,7 +1768,14 @@ def get_default_settings(filename):
ui_defaults.update({
"guidance_scale": 7.5,
"flow_shift": 13,
"resolution": "1280x720"
"resolution": "1280x720",
})
elif get_model_type(filename) in ("hunyuan_avatar"):
ui_defaults.update({
"guidance_scale": 7.5,
"flow_shift": 5,
"tea_cache_start_step_perc": 25,
"video_length": 129,
})
elif get_model_type(filename) in ("vace_14B"):
ui_defaults.update({
@ -1954,8 +1971,13 @@ def download_models(transformer_filename):
text_encoder_filename = get_hunyuan_text_encoder_filename(text_encoder_quantization)
model_def = {
"repoId" : "DeepBeepMeep/HunyuanVideo",
"sourceFolderList" : [ "llava-llama-3-8b", "clip_vit_large_patch14", "" ],
"fileList" :[ ["config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "preprocessor_config.json"] + computeList(text_encoder_filename) , ["config.json", "merges.txt", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "vocab.json"], [ "hunyuan_video_720_quanto_int8_map.json", "hunyuan_video_custom_VAE_fp32.safetensors", "hunyuan_video_custom_VAE_config.json", "hunyuan_video_VAE_fp32.safetensors", "hunyuan_video_VAE_config.json" , "hunyuan_video_720_quanto_int8_map.json" ] + computeList(transformer_filename) ]
"sourceFolderList" : [ "llava-llama-3-8b", "clip_vit_large_patch14", "whisper-tiny" , "det_align", "" ],
"fileList" :[ ["config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "preprocessor_config.json"] + computeList(text_encoder_filename) ,
["config.json", "merges.txt", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "vocab.json"],
["config.json", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer_config.json"],
["detface.pt"],
[ "hunyuan_video_720_quanto_int8_map.json", "hunyuan_video_custom_VAE_fp32.safetensors", "hunyuan_video_custom_VAE_config.json", "hunyuan_video_VAE_fp32.safetensors", "hunyuan_video_VAE_config.json" , "hunyuan_video_720_quanto_int8_map.json" ] + computeList(transformer_filename)
]
}
else:
@ -2122,6 +2144,14 @@ def load_hunyuan_model(model_filename, quantizeTransformer = False, dtype = torc
pipe = { "transformer" : hunyuan_model.model, "text_encoder" : hunyuan_model.text_encoder, "text_encoder_2" : hunyuan_model.text_encoder_2, "vae" : hunyuan_model.vae }
if hunyuan_model.wav2vec != None:
pipe["wav2vec"] = hunyuan_model.wav2vec
# if hunyuan_model.align_instance != None:
# pipe["align_instance"] = hunyuan_model.align_instance.facedet.model
from hyvideo.modules.models import get_linear_split_map
split_linear_modules_map = get_linear_split_map()
@ -2818,9 +2848,13 @@ def generate_video(
hunyuan_t2v = "hunyuan_video_720" in model_filename
hunyuan_i2v = "hunyuan_video_i2v" in model_filename
hunyuan_custom = "hunyuan_video_custom" in model_filename
hunyuan_avatar = "hunyuan_video_avatar" in model_filename
fantasy = "fantasy" in model_filename
if diffusion_forcing or hunyuan_t2v or hunyuan_i2v or hunyuan_custom:
fps = 24
elif audio_guide != None:
elif hunyuan_avatar:
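# Hunyuan Avatar generates audio-driven video at 25 frames per second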
fps = 25
elif fantasy:
fps = 23
elif ltxv:
fps = 30
@ -2829,11 +2863,13 @@ def generate_video(
latent_size = 8 if ltxv else 4
original_image_refs = image_refs
if image_refs != None and len(image_refs) > 0 and (hunyuan_custom or phantom or vace):
send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
if image_refs != None and len(image_refs) > 0 and (hunyuan_custom or phantom or hunyuan_avatar or vace):
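# Hunyuan Avatar uses the reference image as-is, so background removal is forced off below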
if hunyuan_avatar: remove_background_images_ref = 0
if remove_background_images_ref > 0:
send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
from wan.utils.utils import resize_and_remove_background
image_refs = resize_and_remove_background(image_refs, width, height, remove_background_images_ref, fit_into_canvas= not vace)
image_refs = resize_and_remove_background(image_refs, width, height, remove_background_images_ref, fit_into_canvas= not (vace or hunyuan_avatar) ) # no canvas fit for Vace reference images (handled later) or for Hunyuan Avatar
update_task_thumbnails(task, locals())
send_cmd("output")
@ -2866,13 +2902,14 @@ def generate_video(
audio_proj_split = None
audio_scale = None
audio_context_lens = None
if audio_guide != None:
if (fantasy or hunyuan_avatar) and audio_guide != None:
from fantasytalking.infer import parse_audio
import librosa
duration = librosa.get_duration(path=audio_guide)
current_video_length = min(int(fps * duration // 4) * 4 + 5, current_video_length)
audio_proj_split, audio_context_lens = parse_audio(audio_guide, num_frames= current_video_length, fps= fps, device= processing_device )
audio_scale = 1.0
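# cap the requested length at the audio duration, rounded to the 4n+1 frame count expected by the models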
current_video_length = min(int(fps * duration // 4) * 4 + 5, current_video_length)
if fantasy:
audio_proj_split, audio_context_lens = parse_audio(audio_guide, num_frames= current_video_length, fps= fps, device= processing_device )
audio_scale = 1.0
import random
if seed == None or seed <0:
@ -2990,7 +3027,7 @@ def generate_video(
if reuse_frames > 0:
return_latent_slice = slice(-(reuse_frames - 1 + discard_last_frames ) // latent_size, None if discard_last_frames == 0 else -(discard_last_frames // latent_size) )
if hunyuan_custom:
if hunyuan_custom or hunyuan_avatar:
src_ref_images = image_refs
elif phantom:
src_ref_images = image_refs.copy() if image_refs != None else None
@ -3098,6 +3135,7 @@ def generate_video(
cfg_star_switch = cfg_star_switch,
cfg_zero_step = cfg_zero_step,
audio_cfg_scale= audio_guidance_scale,
audio_guide=audio_guide,
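# the raw audio path (audio_guide) is passed through, presumably for Hunyuan Avatar's own audio processing; audio_proj / audio_context_lens are only computed for Fantasy above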
audio_proj= audio_proj_split,
audio_scale= audio_scale,
audio_context_lens= audio_context_lens,
@ -4502,6 +4540,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
hunyuan_t2v = "hunyuan_video_720" in model_filename
hunyuan_i2v = "hunyuan_video_i2v" in model_filename
hunyuan_video_custom = "hunyuan_video_custom" in model_filename
hunyuan_video_avatar = "hunyuan_video_avatar" in model_filename
sliding_window_enabled = vace or diffusion_forcing or ltxv
new_line_text = "each new line of prompt will be used for a window" if sliding_window_enabled else "each new line of prompt will generate a new video"
@ -4575,7 +4614,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
model_mode = gr.Dropdown(value=None, visible=False)
keep_frames_video_source = gr.Text(visible=False)
with gr.Column(visible= vace or phantom or hunyuan_video_custom) as video_prompt_column:
with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar) as video_prompt_column:
video_prompt_type_value= ui_defaults.get("video_prompt_type","")
video_prompt_type = gr.Text(value= video_prompt_type_value, visible= False)
with gr.Row():
@ -4624,14 +4663,14 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
("Keep it for first Image (landscape) and remove it for other Images (objects / faces)", 2),
],
value=ui_defaults.get("remove_background_images_ref",1),
label="Remove Background of Images References", scale = 3, visible= "I" in video_prompt_type_value
label="Remove Background of Images References", scale = 3, visible= "I" in video_prompt_type_value and not hunyuan_video_avatar
)
# remove_background_images_ref = gr.Checkbox(value=ui_defaults.get("remove_background_images_ref",1), label= "Remove Background of Images References", visible= "I" in video_prompt_type_value, scale =1 )
video_mask = gr.Video(label= "Video Mask (for Inpainting or Outpaing, white pixels = Mask)", visible= "M" in video_prompt_type_value, value= ui_defaults.get("video_mask", None))
audio_guide = gr.Audio(value= ui_defaults.get("audio_guide", None), type="filepath", label="Voice to follow", show_download_button= True, visible= fantasy )
audio_guide = gr.Audio(value= ui_defaults.get("audio_guide", None), type="filepath", label="Voice to follow", show_download_button= True, visible= fantasy or hunyuan_video_avatar )
advanced_prompt = advanced_ui
prompt_vars=[]
@ -4720,6 +4759,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
video_length = gr.Slider(17, 737, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (16 = 1s)", interactive= True)
elif fantasy:
video_length = gr.Slider(5, 233, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (23 = 1s)", interactive= True)
elif hunyuan_video_avatar:
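# up to 401 frames, i.e. about 16 s at 25 fps (roughly three of the model's 128-frame segments)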
video_length = gr.Slider(5, 401, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (25 = 1s)", interactive= True)
elif hunyuan_t2v or hunyuan_i2v:
video_length = gr.Slider(5, 337, value=ui_defaults.get("video_length", 97), step=4, label="Number of frames (24 = 1s)", interactive= True)
else:
@ -4809,7 +4850,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
)
with gr.Tab("Quality", visible = not ltxv) as quality_tab:
with gr.Column(visible = not (hunyuan_i2v or hunyuan_t2v or hunyuan_video_custom) ) as skip_layer_guidance_row:
with gr.Column(visible = not (hunyuan_i2v or hunyuan_t2v or hunyuan_video_custom or hunyuan_video_avatar) ) as skip_layer_guidance_row:
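# Skip Layer Guidance is not exposed for the Hunyuan models, so hide its controls for them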
gr.Markdown("<B>Skip Layer Guidance (improves video quality)</B>")
with gr.Row():
slg_switch = gr.Dropdown(