Mirror of https://github.com/Wan-Video/Wan2.1.git (synced 2025-11-04 06:15:17 +00:00)

Added Hunyuan Custom Audio and Edit support

parent 026c2b0cbb
commit 43aa414eaf
@@ -20,11 +20,16 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models

**Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep

## 🔥 Latest Updates

### May 28 2025: WanGP v5.41
### June 11 2025: WanGP v5.5
👋 *Hunyuan Video Custom Audio*: similar to Hunyuan Video Avatar, except there is no lower limit on the number of frames and you can use your reference images in a context different from the image itself\
*Hunyuan Video Custom Edit*: a Hunyuan Video ControlNet; use it for inpainting or to replace a person in a video while keeping their poses. Similar to Vace, but less restricted than the Wan models in terms of content.

### June 6 2025: WanGP v5.41
👋 Bonus release: support for the **AccVideo** Lora to speed up video generation by 2x in Wan models. Check the Loras documentation for AccVideo usage instructions.\
You will need to run *pip install -r requirements.txt*

### May 28 2025: WanGP v5.4
### June 6 2025: WanGP v5.4
👋 World exclusive: **Hunyuan Video Avatar** support! You won't need 80 GB of VRAM, nor even 32 GB: 10 GB of VRAM is sufficient to generate up to 15s of high-quality speech/song-driven video at high speed with no quality degradation. Support for TeaCache included.\
Here is a link to the original repo, where you will find some very interesting documentation and examples: https://github.com/Tencent-Hunyuan/HunyuanVideo-Avatar. Kudos to the Hunyuan Video Avatar team for the best model of its kind.\
Also many thanks to Reevoy24 for repackaging the model and completing the documentation
@@ -1,10 +1,14 @@
# Changelog

## 🔥 Latest News
### May 28 2025: WanGP v5.41
### June 11 2025: WanGP v5.5
👋 *Hunyuan Video Custom Audio*: similar to Hunyuan Video Avatar, except there is no lower limit on the number of frames and you can use your reference images in a context different from the image itself\
*Hunyuan Video Custom Edit*: a Hunyuan Video ControlNet; use it for inpainting or to replace a person in a video while keeping their poses. Similar to Vace, but less restricted than the Wan models in terms of content.

### June 6 2025: WanGP v5.41
👋 Bonus release: support for the **AccVideo** Lora to speed up video generation by 2x in Wan models. Check the Loras documentation for AccVideo usage instructions.

### May 28 2025: WanGP v5.4
### June 6 2025: WanGP v5.4
👋 World exclusive: Hunyuan Video Avatar support! You won't need 80 GB of VRAM, nor even 32 GB: 10 GB of VRAM is sufficient to generate up to 15s of high-quality speech/song-driven video at high speed with no quality degradation. Support for TeaCache included.

### May 26, 2025: WanGP v5.3
@ -54,6 +54,10 @@ def encode_audio(wav2vec, audio_feats, fps, num_frames=129):
 | 
			
		||||
    elif fps == 12.5:
 | 
			
		||||
        start_ts = [0]
 | 
			
		||||
        step_ts = [2]
 | 
			
		||||
    else:
 | 
			
		||||
        start_ts = [0]
 | 
			
		||||
        step_ts = [1]
 | 
			
		||||
 | 
			
		||||
    num_frames = min(num_frames, 400)
 | 
			
		||||
    audio_feats = wav2vec.encoder(audio_feats.unsqueeze(0)[:, :, :3000], output_hidden_states=True).hidden_states
 | 
			
		||||
    audio_feats = torch.stack(audio_feats, dim=2)
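For reference, the audio features consumed above are stacked hidden states from a Whisper encoder. A minimal, self-contained sketch of that extraction with Hugging Face `transformers`, using `openai/whisper-tiny` only as a stand-in for the repo's `ckpts/whisper-tiny/` checkpoint:

```python
import torch
from transformers import AutoFeatureExtractor, WhisperModel

feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")
wav2vec = WhisperModel.from_pretrained("openai/whisper-tiny").eval()

# 4 s of placeholder 16 kHz audio -> padded log-mel features of shape [1, 80, 3000].
audio = torch.randn(16000 * 4)
feats = feature_extractor(audio.numpy(), sampling_rate=16000, return_tensors="pt").input_features

with torch.no_grad():
    hidden = wav2vec.encoder(feats, output_hidden_states=True).hidden_states
# Stack every encoder layer along a new axis, as encode_audio() does above.
stacked = torch.stack(hidden, dim=2)  # [1, 1500, n_layers + 1, d_model]
print(stacked.shape)
```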
 | 
			
		||||
 | 
			
		||||
@ -774,7 +774,10 @@ class HunyuanVideoPipeline(DiffusionPipeline):
 | 
			
		||||
        # uncond_ref_latents: Optional[torch.Tensor] = None,
 | 
			
		||||
        pixel_value_llava: Optional[torch.Tensor] = None,
 | 
			
		||||
        uncond_pixel_value_llava: Optional[torch.Tensor] = None,
 | 
			
		||||
        bg_latents: Optional[torch.Tensor] = None,
 | 
			
		||||
        audio_prompts: Optional[torch.Tensor] = None,
 | 
			
		||||
        ip_cfg_scale: float = 0.0,
 | 
			
		||||
        audio_strength: float = 1.0,
 | 
			
		||||
        use_deepcache: int = 1,
 | 
			
		||||
        num_videos_per_prompt: Optional[int] = 1,
 | 
			
		||||
        eta: float = 0.0,
 | 
			
		||||
@ -922,6 +925,9 @@ class HunyuanVideoPipeline(DiffusionPipeline):
 | 
			
		||||
        #         "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
 | 
			
		||||
        #     )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        if self._interrupt:
 | 
			
		||||
            return [None]
 | 
			
		||||
        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
 | 
			
		||||
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 | 
			
		||||
 | 
			
		||||
@ -968,7 +974,6 @@ class HunyuanVideoPipeline(DiffusionPipeline):
 | 
			
		||||
        self._guidance_rescale = guidance_rescale
 | 
			
		||||
        self._clip_skip = clip_skip
 | 
			
		||||
        self._cross_attention_kwargs = cross_attention_kwargs
 | 
			
		||||
        self._interrupt = False
 | 
			
		||||
 | 
			
		||||
        # 2. Define call parameters
 | 
			
		||||
        if prompt is not None and isinstance(prompt, str):
 | 
			
		||||
@ -1057,6 +1062,12 @@ class HunyuanVideoPipeline(DiffusionPipeline):
 | 
			
		||||
                    prompt_mask[0] = torch.cat([torch.ones((1, prompt_mask[0].sum() - 575)).to(prompt_mask), 
 | 
			
		||||
                                                torch.zeros((1, prompt_mask.shape[1] - prompt_mask[0].sum() + 575)).to(prompt_mask)], dim=1)
 | 
			
		||||
 | 
			
		||||
            if bg_latents is not None:
 | 
			
		||||
                bg_latents = torch.cat([bg_latents, bg_latents], dim=0)
 | 
			
		||||
 | 
			
		||||
            if audio_prompts is not None:
 | 
			
		||||
                audio_prompts = torch.cat([torch.zeros_like(audio_prompts), audio_prompts], dim=0)
 | 
			
		||||
 | 
			
		||||
        if ip_cfg_scale>0:
 | 
			
		||||
            prompt_embeds = torch.cat([prompt_embeds, prompt_embeds[1:]])
 | 
			
		||||
            prompt_embeds_2 = torch.cat([prompt_embeds_2, prompt_embeds_2[1:]])
 | 
			
		||||
@ -1263,6 +1274,9 @@ class HunyuanVideoPipeline(DiffusionPipeline):
 | 
			
		||||
                                pipeline=self,
 | 
			
		||||
                                x_id=j,
 | 
			
		||||
                                step_no=i,
 | 
			
		||||
                                bg_latents=bg_latents[j].unsqueeze(0) if bg_latents!=None else None,
 | 
			
		||||
                                audio_prompts=audio_prompts[j].unsqueeze(0) if audio_prompts!=None else None,
 | 
			
		||||
                                audio_strength=audio_strength,
 | 
			
		||||
                                callback = callback,
 | 
			
		||||
                            )
 | 
			
		||||
                            if self._interrupt:
 | 
			
		||||
@ -1292,6 +1306,9 @@ class HunyuanVideoPipeline(DiffusionPipeline):
 | 
			
		||||
                            guidance=guidance_expand,
 | 
			
		||||
                            pipeline=self,
 | 
			
		||||
                            step_no=i,
 | 
			
		||||
                            bg_latents=bg_latents,
 | 
			
		||||
                            audio_prompts=audio_prompts,
 | 
			
		||||
                            audio_strength=audio_strength,
 | 
			
		||||
                            callback = callback,
 | 
			
		||||
                        )
 | 
			
		||||
                        if self._interrupt:
 | 
			
		||||
 | 
			
		||||
@ -908,6 +908,10 @@ class HunyuanVideoAudioPipeline(DiffusionPipeline):
 | 
			
		||||
                second element is a list of `bool`s indicating whether the corresponding generated image contains
 | 
			
		||||
                "not-safe-for-work" (nsfw) content.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if self._interrupt:
 | 
			
		||||
            return [None]
 | 
			
		||||
 | 
			
		||||
        callback = kwargs.pop("callback", None)
 | 
			
		||||
        callback_steps = kwargs.pop("callback_steps", None)
 | 
			
		||||
        if callback_steps is not None:
 | 
			
		||||
@ -956,7 +960,6 @@ class HunyuanVideoAudioPipeline(DiffusionPipeline):
 | 
			
		||||
        self._guidance_rescale = guidance_rescale
 | 
			
		||||
        self._clip_skip = clip_skip
 | 
			
		||||
        self._cross_attention_kwargs = cross_attention_kwargs
 | 
			
		||||
        self._interrupt = False
 | 
			
		||||
 | 
			
		||||
        # 2. Define call parameters
 | 
			
		||||
        if prompt is not None and isinstance(prompt, str):
 | 
			
		||||
 | 
			
		||||
@ -235,13 +235,13 @@ def patched_llava_forward(
 | 
			
		||||
        image_hidden_states=image_features if pixel_values is not None else None,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
def adapt_avatar_model(model):
 | 
			
		||||
def adapt_model(model, audio_block_name):
 | 
			
		||||
    modules_dict= { k: m for k, m in model.named_modules()}
 | 
			
		||||
    for model_layer, avatar_layer in model.double_stream_map.items():
 | 
			
		||||
        module = modules_dict[f"audio_adapter_blocks.{avatar_layer}"]
 | 
			
		||||
        module = modules_dict[f"{audio_block_name}.{avatar_layer}"]
 | 
			
		||||
        target = modules_dict[f"double_blocks.{model_layer}"]
 | 
			
		||||
        setattr(target, "audio_adapter", module )
 | 
			
		||||
    delattr(model, "audio_adapter_blocks")
 | 
			
		||||
    delattr(model, audio_block_name)
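For readers skimming the diff, the remapping above reduces to a small self-contained toy (the class and attribute names below are illustrative stand-ins, not the real transformer blocks):

```python
import torch.nn as nn

class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.double_blocks = nn.ModuleList(ToyBlock() for _ in range(4))
        self.audio_models = nn.ModuleList(nn.Linear(4, 4) for _ in range(2))
        self.double_stream_map = {"1": 0, "3": 1}  # block index -> adapter index

def adapt_model_sketch(model, audio_block_name):
    # Re-attach each adapter to the block it serves, then drop the original
    # container so the weights are not registered twice.
    modules_dict = {k: m for k, m in model.named_modules()}
    for model_layer, adapter_layer in model.double_stream_map.items():
        setattr(modules_dict[f"double_blocks.{model_layer}"], "audio_adapter",
                modules_dict[f"{audio_block_name}.{adapter_layer}"])
    delattr(model, audio_block_name)

m = ToyModel()
adapt_model_sketch(m, "audio_models")
print(hasattr(m.double_blocks[1], "audio_adapter"))  # True
```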
 | 
			
		||||
 | 
			
		||||
class DataPreprocess(object):
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
@ -329,17 +329,25 @@ class Inference(object):
 | 
			
		||||
        precision = "bf16"
 | 
			
		||||
        vae_precision = "fp32" if VAE_dtype == torch.float32 else "bf16" 
 | 
			
		||||
        embedded_cfg_scale = 6
 | 
			
		||||
        filepath = model_filepath[0]
 | 
			
		||||
        i2v_condition_type = None
 | 
			
		||||
        i2v_mode = "i2v" in model_filepath[0]
 | 
			
		||||
        i2v_mode = "i2v" in filepath 
 | 
			
		||||
        custom = False
 | 
			
		||||
        custom_audio = False
 | 
			
		||||
        avatar = False 
 | 
			
		||||
        if i2v_mode:
 | 
			
		||||
            model_id = "HYVideo-T/2"
 | 
			
		||||
            i2v_condition_type = "token_replace"
 | 
			
		||||
        elif "custom" in model_filepath[0]:
 | 
			
		||||
        elif "custom" in filepath:
 | 
			
		||||
            if "audio" in filepath:
 | 
			
		||||
                model_id = "HYVideo-T/2-custom-audio"
 | 
			
		||||
                custom_audio = True
 | 
			
		||||
            elif "edit" in filepath:
 | 
			
		||||
                model_id = "HYVideo-T/2-custom-edit"
 | 
			
		||||
            else: 
 | 
			
		||||
                model_id = "HYVideo-T/2-custom"
 | 
			
		||||
            custom = True
 | 
			
		||||
        elif "avatar" in model_filepath[0]:
 | 
			
		||||
        elif "avatar" in filepath :
 | 
			
		||||
            model_id = "HYVideo-T/2-avatar"
 | 
			
		||||
            text_len = 256
 | 
			
		||||
            avatar = True
 | 
			
		||||
@ -376,11 +384,11 @@ class Inference(object):
 | 
			
		||||
        from mmgp import offload
 | 
			
		||||
        # model = Inference.load_state_dict(args, model, model_filepath)
 | 
			
		||||
 | 
			
		||||
        # model_filepath ="c:/temp/avatar/mp_rank_00_model_states.pt"
 | 
			
		||||
        # model_filepath ="c:/temp/hc/mp_rank_00_model_states_video.pt"
 | 
			
		||||
        offload.load_model_data(model, model_filepath, pinToMemory = pinToMemory, partialPinning = partialPinning)
 | 
			
		||||
        pass
 | 
			
		||||
        # offload.save_model(model, "hunyuan_video_avatar_720_bf16.safetensors")
 | 
			
		||||
        # offload.save_model(model, "hunyuan_video_avatar_720_quanto_bf16_int8.safetensors", do_quantize= True)
 | 
			
		||||
        # offload.save_model(model, "hunyuan_video_avatar_edit_720_bf16.safetensors")
 | 
			
		||||
        # offload.save_model(model, "hunyuan_video_avatar_edit_720_quanto_bf16_int8.safetensors", do_quantize= True)
 | 
			
		||||
 | 
			
		||||
        model.mixed_precision = mixed_precision_transformer
 | 
			
		||||
 | 
			
		||||
@ -472,15 +480,17 @@ class Inference(object):
 | 
			
		||||
        wav2vec = None
 | 
			
		||||
        align_instance = None
 | 
			
		||||
 | 
			
		||||
        if avatar:
 | 
			
		||||
        if avatar or custom_audio:
 | 
			
		||||
            feature_extractor = AutoFeatureExtractor.from_pretrained("ckpts/whisper-tiny/")
 | 
			
		||||
            wav2vec = WhisperModel.from_pretrained("ckpts/whisper-tiny/").to(device="cpu", dtype=torch.float32)
 | 
			
		||||
            wav2vec._model_dtype = torch.float32
 | 
			
		||||
            wav2vec.requires_grad_(False)
 | 
			
		||||
        if avatar:
 | 
			
		||||
            align_instance = AlignImage("cuda", det_path="ckpts/det_align/detface.pt")
 | 
			
		||||
            align_instance.facedet.model.to("cpu")
 | 
			
		||||
 | 
			
		||||
            adapt_avatar_model(model)
 | 
			
		||||
            adapt_model(model, "audio_adapter_blocks")
 | 
			
		||||
        elif custom_audio:
 | 
			
		||||
            adapt_model(model, "audio_models")
 | 
			
		||||
 | 
			
		||||
        return cls(
 | 
			
		||||
            i2v=i2v_mode,
 | 
			
		||||
@ -600,7 +610,7 @@ class HunyuanVideoSampler(Inference):
 | 
			
		||||
 
 | 
			
		||||
        return pipeline
 | 
			
		||||
 | 
			
		||||
    def get_rotary_pos_embed_new(self, video_length, height, width, concat_dict={}):
 | 
			
		||||
    def get_rotary_pos_embed_new(self, video_length, height, width, concat_dict={}, enable_riflex = False):
 | 
			
		||||
        target_ndim = 3
 | 
			
		||||
        ndim = 5 - 2
 | 
			
		||||
        latents_size = [(video_length-1)//4+1 , height//8, width//8]
 | 
			
		||||
@ -628,7 +638,10 @@ class HunyuanVideoSampler(Inference):
 | 
			
		||||
                                                    theta=256, 
 | 
			
		||||
                                                    use_real=True,
 | 
			
		||||
                                                    theta_rescale_factor=1,
 | 
			
		||||
                                                    concat_dict=concat_dict)
 | 
			
		||||
                                                    concat_dict=concat_dict,
 | 
			
		||||
                                                    L_test = (video_length - 1) // 4 + 1,
 | 
			
		||||
                                                    enable_riflex = enable_riflex
 | 
			
		||||
                                                    )
 | 
			
		||||
        return freqs_cos, freqs_sin
 | 
			
		||||
        
 | 
			
		||||
    def get_rotary_pos_embed(self, video_length, height, width, enable_riflex = False):
 | 
			
		||||
@ -687,6 +700,9 @@ class HunyuanVideoSampler(Inference):
 | 
			
		||||
        input_prompt,
 | 
			
		||||
        input_ref_images = None,
 | 
			
		||||
        audio_guide = None,
 | 
			
		||||
        input_frames = None,
 | 
			
		||||
        input_masks = None,
 | 
			
		||||
        input_video = None,        
 | 
			
		||||
        fps = 24,
 | 
			
		||||
        height=192,
 | 
			
		||||
        width=336,
 | 
			
		||||
@ -701,13 +717,14 @@ class HunyuanVideoSampler(Inference):
 | 
			
		||||
        num_videos_per_prompt=1,
 | 
			
		||||
        i2v_resolution="720p",
 | 
			
		||||
        image_start=None,
 | 
			
		||||
        enable_riflex = False,
 | 
			
		||||
        enable_RIFLEx = False,
 | 
			
		||||
        i2v_condition_type: str = "token_replace",
 | 
			
		||||
        i2v_stability=True,
 | 
			
		||||
        VAE_tile_size = None,
 | 
			
		||||
        joint_pass = False,
 | 
			
		||||
        cfg_star_switch = False,
 | 
			
		||||
        fit_into_canvas = True,
 | 
			
		||||
        conditioning_latents_size = 0,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ):
 | 
			
		||||
 | 
			
		||||
@ -777,6 +794,7 @@ class HunyuanVideoSampler(Inference):
 | 
			
		||||
        target_height = align_to(height, 16)
 | 
			
		||||
        target_width = align_to(width, 16)
 | 
			
		||||
        target_frame_num = frame_num
 | 
			
		||||
        audio_strength = 1
 | 
			
		||||
 | 
			
		||||
        if input_ref_images  != None:
 | 
			
		||||
            # ip_cfg_scale = 3.0
 | 
			
		||||
@ -862,7 +880,7 @@ class HunyuanVideoSampler(Inference):
 | 
			
		||||
        # ========================================================================
 | 
			
		||||
 | 
			
		||||
        if input_ref_images == None:
 | 
			
		||||
            freqs_cos, freqs_sin = self.get_rotary_pos_embed(target_frame_num, target_height, target_width, enable_riflex)
 | 
			
		||||
            freqs_cos, freqs_sin = self.get_rotary_pos_embed(target_frame_num, target_height, target_width, enable_RIFLEx)
 | 
			
		||||
        else:
 | 
			
		||||
            if self.avatar:
 | 
			
		||||
                w, h = input_ref_images.size
 | 
			
		||||
@ -873,8 +891,13 @@ class HunyuanVideoSampler(Inference):
 | 
			
		||||
                concat_dict = {'mode': 'timecat', 'bias': -1} 
 | 
			
		||||
                freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(129, target_height, target_width, concat_dict)
 | 
			
		||||
            else:
 | 
			
		||||
                if input_frames != None:
 | 
			
		||||
                    target_height, target_width = input_frames.shape[-3:-1]
 | 
			
		||||
                elif input_video != None:
 | 
			
		||||
                    target_height, target_width = input_video.shape[-2:]
 | 
			
		||||
 | 
			
		||||
                concat_dict = {'mode': 'timecat-w', 'bias': -1} 
 | 
			
		||||
                freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(target_frame_num, target_height, target_width, concat_dict)
 | 
			
		||||
                freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(target_frame_num, target_height, target_width, concat_dict, enable_RIFLEx)
 | 
			
		||||
 | 
			
		||||
        n_tokens = freqs_cos.shape[0]
 | 
			
		||||
 | 
			
		||||
@ -892,7 +915,35 @@ class HunyuanVideoSampler(Inference):
 | 
			
		||||
 | 
			
		||||
        ref_latents, uncond_audio_prompts, audio_prompts, face_masks, motion_exp, motion_pose = None, None, None, None, None, None
 | 
			
		||||
 | 
			
		||||
        if audio_guide != None:
 | 
			
		||||
 | 
			
		||||
        bg_latents = None
 | 
			
		||||
        if input_video != None:
 | 
			
		||||
            pixel_value_bg = input_video.unsqueeze(0)
 | 
			
		||||
            pixel_value_mask =  torch.zeros_like(input_video).unsqueeze(0)
 | 
			
		||||
        if input_frames != None:
 | 
			
		||||
            pixel_value_video_bg = input_frames.permute(-1,0,1,2).unsqueeze(0).float()
 | 
			
		||||
            pixel_value_video_mask = input_masks.unsqueeze(-1).repeat(1,1,1,3).permute(-1,0,1,2).unsqueeze(0).float()
 | 
			
		||||
            pixel_value_video_bg = pixel_value_video_bg.div_(127.5).add_(-1.)
 | 
			
		||||
            if input_video != None:
 | 
			
		||||
                pixel_value_bg = torch.cat([pixel_value_bg, pixel_value_video_bg], dim=2)
 | 
			
		||||
                pixel_value_mask = torch.cat([ pixel_value_mask, pixel_value_video_mask], dim=2)
 | 
			
		||||
            else:
 | 
			
		||||
                pixel_value_bg = pixel_value_video_bg
 | 
			
		||||
                pixel_value_mask = pixel_value_video_mask
 | 
			
		||||
            pixel_value_video_mask, pixel_value_video_bg  = None, None
 | 
			
		||||
        if input_video != None or input_frames != None:
 | 
			
		||||
            if pixel_value_bg.shape[2] < frame_num:
 | 
			
		||||
                padding_shape = list(pixel_value_bg.shape[0:2]) + [frame_num-pixel_value_bg.shape[2]] +  list(pixel_value_bg.shape[3:])  
 | 
			
		||||
                pixel_value_bg = torch.cat([pixel_value_bg, torch.full(padding_shape, -1, dtype=pixel_value_bg.dtype, device= pixel_value_bg.device ) ], dim=2)
 | 
			
		||||
                pixel_value_mask = torch.cat([ pixel_value_mask, torch.full(padding_shape, 255, dtype=pixel_value_mask.dtype, device= pixel_value_mask.device ) ], dim=2)
 | 
			
		||||
 | 
			
		||||
            bg_latents = self.vae.encode(pixel_value_bg).latent_dist.sample()                
 | 
			
		||||
            pixel_value_mask = pixel_value_mask.div_(127.5).add_(-1.)             
 | 
			
		||||
            mask_latents = self.vae.encode(pixel_value_mask).latent_dist.sample()
 | 
			
		||||
            bg_latents = torch.cat([bg_latents, mask_latents], dim=1)
 | 
			
		||||
            bg_latents.mul_(self.vae.config.scaling_factor)
 | 
			
		||||
 | 
			
		||||
        if self.avatar:
 | 
			
		||||
            if n_prompt == None or len(n_prompt) == 0:
 | 
			
		||||
                n_prompt = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, Lens changes"
 | 
			
		||||
 | 
			
		||||
@ -930,28 +981,33 @@ class HunyuanVideoSampler(Inference):
 | 
			
		||||
                # from wan.utils.utils import cache_video
 | 
			
		||||
                # cache_video( tensor=image, save_file="decode.mp4", fps=25, nrow=1, normalize=True, value_range=(-1, 1))
 | 
			
		||||
 | 
			
		||||
            motion_pose = np.array([25] * 4)
 | 
			
		||||
            motion_exp = np.array([30] * 4)
 | 
			
		||||
            motion_pose = torch.from_numpy(motion_pose).unsqueeze(0)
 | 
			
		||||
            motion_exp = torch.from_numpy(motion_exp).unsqueeze(0)
 | 
			
		||||
 | 
			
		||||
            face_masks = torch.nn.functional.interpolate(face_masks.float().squeeze(2), 
 | 
			
		||||
                                                    (ref_latents.shape[-2], 
 | 
			
		||||
                                                    ref_latents.shape[-1]), 
 | 
			
		||||
                                                    mode="bilinear").unsqueeze(2).to(dtype=ref_latents.dtype)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        if audio_guide != None:            
 | 
			
		||||
            audio_input, audio_len = get_audio_feature(self.feature_extractor, audio_guide, duration = frame_num/fps )
 | 
			
		||||
            audio_prompts = audio_input[0]
 | 
			
		||||
            weight_dtype = audio_prompts.dtype
 | 
			
		||||
            
 | 
			
		||||
            motion_pose = np.array([25] * 4)
 | 
			
		||||
            motion_exp = np.array([30] * 4)
 | 
			
		||||
            motion_pose = torch.from_numpy(motion_pose).unsqueeze(0)
 | 
			
		||||
            motion_exp = torch.from_numpy(motion_exp).unsqueeze(0)
 | 
			
		||||
            if self.custom:
 | 
			
		||||
                audio_len = min(audio_len, frame_num)
 | 
			
		||||
                audio_input = audio_input[:, :audio_len]
 | 
			
		||||
            audio_prompts = encode_audio(self.wav2vec, audio_prompts.to(dtype=self.wav2vec.dtype), fps, num_frames=audio_len) 
 | 
			
		||||
            audio_prompts = audio_prompts.to(self.model.dtype)
 | 
			
		||||
            if audio_prompts.shape[1] <= 129:
 | 
			
		||||
                audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1,129-audio_prompts.shape[1], 1, 1, 1)], dim=1)
 | 
			
		||||
            segment_size = 129 if self.avatar else frame_num
 | 
			
		||||
            if audio_prompts.shape[1] <= segment_size:
 | 
			
		||||
                audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1,segment_size-audio_prompts.shape[1], 1, 1, 1)], dim=1)
 | 
			
		||||
            else:
 | 
			
		||||
                audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1, 5, 1, 1, 1)], dim=1)
 | 
			
		||||
            uncond_audio_prompts = torch.zeros_like(audio_prompts[:,:129])
 | 
			
		||||
            # target_frame_num = min(target_frame_num, audio_len)
 | 
			
		||||
 | 
			
		||||
        samples = self.pipeline(
 | 
			
		||||
            prompt=input_prompt,
 | 
			
		||||
            height=target_height,
 | 
			
		||||
@ -976,6 +1032,9 @@ class HunyuanVideoSampler(Inference):
 | 
			
		||||
            motion_pose=motion_pose, 
 | 
			
		||||
            fps= torch.from_numpy(np.array(fps)), 
 | 
			
		||||
 | 
			
		||||
            bg_latents = bg_latents,
 | 
			
		||||
            audio_strength = audio_strength,
 | 
			
		||||
 | 
			
		||||
            denoise_strength=denoise_strength,
 | 
			
		||||
            ip_cfg_scale=ip_cfg_scale,             
 | 
			
		||||
            freqs_cis=(freqs_cos, freqs_sin),
 | 
			
		||||
 | 
			
		||||
@ -591,7 +591,10 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
 | 
			
		||||
        dtype: Optional[torch.dtype] = None,
 | 
			
		||||
        device: Optional[torch.device] = None,
 | 
			
		||||
        attention_mode: Optional[str] = "sdpa",
 | 
			
		||||
        video_condition: bool = False,
 | 
			
		||||
        audio_condition: bool = False,
 | 
			
		||||
        avatar = False,
 | 
			
		||||
        custom = False,
 | 
			
		||||
    ):
 | 
			
		||||
        factory_kwargs = {"device": device, "dtype": dtype}
 | 
			
		||||
        super().__init__()
 | 
			
		||||
@ -606,6 +609,10 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
 | 
			
		||||
        self.rope_dim_list = rope_dim_list
 | 
			
		||||
        self.i2v_condition_type = i2v_condition_type
 | 
			
		||||
        self.attention_mode = attention_mode
 | 
			
		||||
        self.video_condition = video_condition
 | 
			
		||||
        self.audio_condition = audio_condition
 | 
			
		||||
        self.avatar = avatar
 | 
			
		||||
        self.custom = custom
 | 
			
		||||
 | 
			
		||||
        # Text projection. Default to linear projection.
 | 
			
		||||
        # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
 | 
			
		||||
@ -710,8 +717,15 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
 | 
			
		||||
            get_activation_layer("silu"),
 | 
			
		||||
            **factory_kwargs,
 | 
			
		||||
        )
 | 
			
		||||
        avatar_audio = avatar
 | 
			
		||||
        if avatar_audio:
 | 
			
		||||
 | 
			
		||||
        if self.video_condition:
 | 
			
		||||
            self.bg_in = PatchEmbed(
 | 
			
		||||
                self.patch_size, self.in_channels * 2, self.hidden_size, **factory_kwargs
 | 
			
		||||
            )
 | 
			
		||||
            self.bg_proj = nn.Linear(self.hidden_size, self.hidden_size)
 | 
			
		||||
 | 
			
		||||
        if audio_condition:
 | 
			
		||||
            if avatar:
 | 
			
		||||
                self.ref_in = PatchEmbed(
 | 
			
		||||
                    self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
 | 
			
		||||
                    )
 | 
			
		||||
@ -741,13 +755,18 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
 | 
			
		||||
 | 
			
		||||
                # -------------------- audio_insert_model --------------------
 | 
			
		||||
                self.double_stream_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
 | 
			
		||||
            self.single_stream_list = []
 | 
			
		||||
            self.double_stream_map = {str(i): j for j, i in enumerate(self.double_stream_list)}
 | 
			
		||||
            self.single_stream_map = {str(i): j+len(self.double_stream_list) for j, i in enumerate(self.single_stream_list)}
 | 
			
		||||
                audio_block_name = "audio_adapter_blocks"
 | 
			
		||||
            elif custom:
 | 
			
		||||
                self.audio_proj = AudioProjNet2(seq_len=10, blocks=5, channels=384, intermediate_dim=1024, output_dim=3072, context_tokens=4)
 | 
			
		||||
                self.double_stream_list = [1, 3, 5, 7, 9, 11]
 | 
			
		||||
                audio_block_name = "audio_models"
 | 
			
		||||
 | 
			
		||||
            self.audio_adapter_blocks = nn.ModuleList([
 | 
			
		||||
            self.double_stream_map = {str(i): j for j, i in enumerate(self.double_stream_list)}
 | 
			
		||||
            self.single_stream_list = []
 | 
			
		||||
            self.single_stream_map = {str(i): j+len(self.double_stream_list) for j, i in enumerate(self.single_stream_list)}
 | 
			
		||||
            setattr(self, audio_block_name,  nn.ModuleList([
 | 
			
		||||
                PerceiverAttentionCA(dim=3072, dim_head=1024, heads=33) for _ in range(len(self.double_stream_list) + len(self.single_stream_list))
 | 
			
		||||
            ])
 | 
			
		||||
            ]))
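A tiny, runnable illustration of why the adapter list is now registered via `setattr` under a configurable name (everything except the attribute names is a toy stand-in):

```python
import torch.nn as nn

class Toy(nn.Module):
    def __init__(self, audio_block_name):
        super().__init__()
        # setattr registers the ModuleList under a configurable name, so
        # avatar checkpoints ("audio_adapter_blocks.*") and custom-audio
        # checkpoints ("audio_models.*") can share the same construction code.
        setattr(self, audio_block_name, nn.ModuleList([nn.Linear(8, 8) for _ in range(2)]))

print([name for name, _ in Toy("audio_models").named_parameters()][:2])
# ['audio_models.0.weight', 'audio_models.0.bias']
```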
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -798,6 +817,8 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
 | 
			
		||||
        motion_pose = None,
 | 
			
		||||
        fps = None,
 | 
			
		||||
        face_mask = None,
 | 
			
		||||
        audio_strength = None,
 | 
			
		||||
        bg_latents = None,
 | 
			
		||||
    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
 | 
			
		||||
    
 | 
			
		||||
        img = x
 | 
			
		||||
@ -854,12 +875,16 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
 | 
			
		||||
 | 
			
		||||
        # Embed image and text.
 | 
			
		||||
        img, shape_mask = self.img_in(img)
 | 
			
		||||
        if audio_prompts != None:
 | 
			
		||||
        if self.avatar:
 | 
			
		||||
            ref_latents_first = ref_latents[:, :, :1].clone()
 | 
			
		||||
            ref_latents,_ = self.ref_in(ref_latents)
 | 
			
		||||
            ref_latents_first,_ = self.img_in(ref_latents_first)
 | 
			
		||||
        elif ref_latents != None:
 | 
			
		||||
        elif self.custom:
 | 
			
		||||
            if ref_latents != None:
 | 
			
		||||
                ref_latents, _ = self.img_in(ref_latents)
 | 
			
		||||
            if bg_latents is not None and self.video_condition:
 | 
			
		||||
                bg_latents, _ = self.bg_in(bg_latents)
 | 
			
		||||
                img += self.bg_proj(bg_latents)
 | 
			
		||||
 | 
			
		||||
        if self.text_projection == "linear":
 | 
			
		||||
            txt = self.txt_in(txt)
 | 
			
		||||
@ -870,7 +895,7 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
 | 
			
		||||
                f"Unsupported text_projection: {self.text_projection}"
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        if audio_prompts != None:
 | 
			
		||||
        if self.avatar:
 | 
			
		||||
            img += self.before_proj(ref_latents)
 | 
			
		||||
            ref_length = ref_latents_first.shape[-2]          # [b s c]
 | 
			
		||||
            img = torch.cat([ref_latents_first, img], dim=-2) # t c
 | 
			
		||||
@ -956,7 +981,10 @@ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
 | 
			
		||||
                        if audio_adapter != None:
 | 
			
		||||
                            real_img = img[i:i+1,ref_length:].view(1, ot, -1, 3072)  
 | 
			
		||||
                            real_img = audio_adapter(audio_feature_all_insert[i:i+1], real_img).view(1, -1, 3072)
 | 
			
		||||
                            if face_mask != None:
 | 
			
		||||
                                real_img *= face_mask[i:i+1]
 | 
			
		||||
                            if audio_strength != None and audio_strength != 1:
 | 
			
		||||
                                real_img *= audio_strength
 | 
			
		||||
                            img[i:i+1, ref_length:] += real_img
 | 
			
		||||
                            real_img = None
 | 
			
		||||
 | 
			
		||||
@ -1095,6 +1123,27 @@ HUNYUAN_VIDEO_CONFIG = {
 | 
			
		||||
        "hidden_size": 3072,
 | 
			
		||||
        "heads_num": 24,
 | 
			
		||||
        "mlp_width_ratio": 4,
 | 
			
		||||
        'custom' : True
 | 
			
		||||
    },
 | 
			
		||||
    'HYVideo-T/2-custom-audio': {                                                                       #   9.0B   / 12.5B
 | 
			
		||||
        "mm_double_blocks_depth": 20,
 | 
			
		||||
        "mm_single_blocks_depth": 40,
 | 
			
		||||
        "rope_dim_list": [16, 56, 56],
 | 
			
		||||
        "hidden_size": 3072,
 | 
			
		||||
        "heads_num": 24,
 | 
			
		||||
        "mlp_width_ratio": 4,
 | 
			
		||||
        'custom' : True,
 | 
			
		||||
        'audio_condition' : True,        
 | 
			
		||||
    },
 | 
			
		||||
    'HYVideo-T/2-custom-edit': {                                                                       #   9.0B   / 12.5B
 | 
			
		||||
        "mm_double_blocks_depth": 20,
 | 
			
		||||
        "mm_single_blocks_depth": 40,
 | 
			
		||||
        "rope_dim_list": [16, 56, 56],
 | 
			
		||||
        "hidden_size": 3072,
 | 
			
		||||
        "heads_num": 24,
 | 
			
		||||
        "mlp_width_ratio": 4,
 | 
			
		||||
        'custom' : True,
 | 
			
		||||
        'video_condition' : True,        
 | 
			
		||||
    },
 | 
			
		||||
    'HYVideo-T/2-avatar': {                                                                       #   9.0B   / 12.5B
 | 
			
		||||
        'mm_double_blocks_depth': 20,
 | 
			
		||||
@ -1104,6 +1153,7 @@ HUNYUAN_VIDEO_CONFIG = {
 | 
			
		||||
        'heads_num': 24,
 | 
			
		||||
        'mlp_width_ratio': 4,
 | 
			
		||||
        'avatar': True,
 | 
			
		||||
        'audio_condition' : True,
 | 
			
		||||
    },
 | 
			
		||||
    
 | 
			
		||||
}
 | 
			
		||||
@ -295,7 +295,10 @@ def apply_rotary_emb( qklist,
 | 
			
		||||
def get_nd_rotary_pos_embed_new(rope_dim_list, start, *args, theta=10000., use_real=False, 
 | 
			
		||||
                            theta_rescale_factor: Union[float, List[float]]=1.0,
 | 
			
		||||
                            interpolation_factor: Union[float, List[float]]=1.0,
 | 
			
		||||
                            concat_dict={}
 | 
			
		||||
                            concat_dict={},
 | 
			
		||||
                            k = 4,
 | 
			
		||||
                            L_test = 66,
 | 
			
		||||
                            enable_riflex = True
 | 
			
		||||
                            ):
 | 
			
		||||
 | 
			
		||||
    grid = get_meshgrid_nd(start, *args, dim=len(rope_dim_list))   # [3, W, H, D] / [2, W, H]
 | 
			
		||||
@ -327,9 +330,17 @@ def get_nd_rotary_pos_embed_new(rope_dim_list, start, *args, theta=10000., use_r
 | 
			
		||||
    # use 1/ndim of dimensions to encode grid_axis
 | 
			
		||||
    embs = []
 | 
			
		||||
    for i in range(len(rope_dim_list)):
 | 
			
		||||
        emb = get_1d_rotary_pos_embed(rope_dim_list[i], grid[i].reshape(-1), theta, use_real=use_real,
 | 
			
		||||
                                      theta_rescale_factor=theta_rescale_factor[i],
 | 
			
		||||
                                      interpolation_factor=interpolation_factor[i])    # 2 x [WHD, rope_dim_list[i]]
 | 
			
		||||
        # === RIFLEx modification start ===
 | 
			
		||||
        # apply RIFLEx for time dimension
 | 
			
		||||
        if i == 0 and enable_riflex:
 | 
			
		||||
            emb = get_1d_rotary_pos_embed_riflex(rope_dim_list[i], grid[i].reshape(-1), theta, use_real=True, k=k, L_test=L_test)
 | 
			
		||||
        # === RIFLEx modification end ===
 | 
			
		||||
        else:
 | 
			
		||||
            emb = get_1d_rotary_pos_embed(rope_dim_list[i], grid[i].reshape(-1), theta, use_real=True, theta_rescale_factor=theta_rescale_factor[i],interpolation_factor=interpolation_factor[i],)
 | 
			
		||||
 | 
			
		||||
        # emb = get_1d_rotary_pos_embed(rope_dim_list[i], grid[i].reshape(-1), theta, use_real=use_real,
 | 
			
		||||
        #                               theta_rescale_factor=theta_rescale_factor[i],
 | 
			
		||||
        #                               interpolation_factor=interpolation_factor[i])    # 2 x [WHD, rope_dim_list[i]]
 | 
			
		||||
        
 | 
			
		||||
        embs.append(emb)
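For context, the RIFLEx branch above caps one temporal RoPE frequency by the inference length. A minimal sketch of that adjustment, assuming the commonly used `0.9 * 2π / L_test` setting (the real helper is `get_1d_rotary_pos_embed_riflex`):

```python
import torch

def rope_1d_riflex_sketch(dim, positions, theta=10000.0, k=4, L_test=66):
    # Plain 1-D RoPE frequencies: theta ** (-2i / dim).
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
    # RIFLEx: cap the k-th ("intrinsic") frequency so a single period spans
    # the whole inference length, suppressing temporal repetition when
    # generating more frames than the model was trained on.
    freqs[k - 1] = 0.9 * 2 * torch.pi / L_test
    angles = torch.outer(positions.float(), freqs)
    return torch.cos(angles), torch.sin(angles)  # "use_real=True"-style output

cos, sin = rope_1d_riflex_sketch(16, torch.arange(66))
print(cos.shape, sin.shape)  # torch.Size([66, 8]) torch.Size([66, 8])
```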
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -517,11 +517,15 @@ def export_image(image_refs, image_output):
 | 
			
		||||
    image_refs.append( image_output)
 | 
			
		||||
    return image_refs
 | 
			
		||||
 | 
			
		||||
def export_to_current_video_engine(foreground_video_output, alpha_video_output):
 | 
			
		||||
def export_to_current_video_engine(model_type, foreground_video_output, alpha_video_output):
 | 
			
		||||
    gr.Info("Masked Video Input and Full Mask transferred to Current Video Engine For Inpainting")
 | 
			
		||||
    # return "MV#" + str(time.time()), foreground_video_output, alpha_video_output
 | 
			
		||||
    if "custom_edit" in model_type:
 | 
			
		||||
        return gr.update(), alpha_video_output
 | 
			
		||||
    else:
 | 
			
		||||
        return foreground_video_output, alpha_video_output
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def teleport_to_video_tab():
 | 
			
		||||
    return gr.Tabs(selected="video_gen")
 | 
			
		||||
 | 
			
		||||
@ -675,7 +679,7 @@ def display(tabs, model_choice, vace_video_input, vace_video_mask, vace_image_re
 | 
			
		||||
                export_to_vace_video_14B_btn.click( fn=teleport_to_vace_14B, inputs=[], outputs=[tabs, model_choice]).then(
 | 
			
		||||
                    fn=export_to_current_video_engine, inputs= [foreground_video_output, alpha_video_output], outputs= [video_prompt_video_guide_trigger, vace_video_input, vace_video_mask])
 | 
			
		||||
                
 | 
			
		||||
                export_to_current_video_engine_btn.click(  fn=export_to_current_video_engine, inputs= [foreground_video_output, alpha_video_output], outputs= [vace_video_input, vace_video_mask]).then( #video_prompt_video_guide_trigger, 
 | 
			
		||||
                export_to_current_video_engine_btn.click(  fn=export_to_current_video_engine, inputs= [model_choice, foreground_video_output, alpha_video_output], outputs= [vace_video_input, vace_video_mask]).then( #video_prompt_video_guide_trigger, 
 | 
			
		||||
                    fn=teleport_to_video_tab, inputs= [], outputs= [tabs])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -33,4 +33,5 @@ librosa
 | 
			
		||||
loguru
 | 
			
		||||
sentencepiece
 | 
			
		||||
av
 | 
			
		||||
opencv-python
 | 
			
		||||
# rembg==2.0.65
 | 
			
		||||
 | 
			
		||||
@ -46,7 +46,8 @@ def resample(video_fps, video_frames_count, max_target_frames_count, target_fps,
 | 
			
		||||
    while True:
 | 
			
		||||
        if max_target_frames_count != 0 and len(frame_ids) >= max_target_frames_count :
 | 
			
		||||
            break
 | 
			
		||||
        add_frames_count = math.ceil( (target_time -cur_time) / video_frame_duration )
 | 
			
		||||
        diff = round( (target_time -cur_time) / video_frame_duration , 5)
 | 
			
		||||
        add_frames_count = math.ceil( diff)
 | 
			
		||||
        frame_no += add_frames_count
 | 
			
		||||
        if frame_no >= video_frames_count:             
 | 
			
		||||
            break
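A small illustration of the floating-point issue the new `round(..., 5)` guard protects against (the numbers are illustrative, not taken from a real video):

```python
import math

# Accumulated float times can land just above an exact integer, so a bare
# ceil() would advance one source frame too many; rounding first avoids that.
video_frame_duration = 0.1
cur_time, target_time = 0.0, 0.1 + 0.2        # e.g. times accumulated over steps
raw = (target_time - cur_time) / video_frame_duration
print(raw)                        # 3.0000000000000004 on IEEE-754 doubles
print(math.ceil(raw))             # 4  -> one frame too far
print(math.ceil(round(raw, 5)))   # 3  -> intended result
```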
 | 
			
		||||
 | 
			
		||||
							
								
								
									
wgp.py (185 changed lines)

@@ -33,6 +33,7 @@ import tempfile
 | 
			
		||||
import atexit
 | 
			
		||||
import shutil
 | 
			
		||||
import glob
 | 
			
		||||
import cv2
 | 
			
		||||
from transformers.utils import logging
 | 
			
		||||
logging.set_verbosity_error()
 | 
			
		||||
 | 
			
		||||
@ -43,7 +44,7 @@ AUTOSAVE_FILENAME = "queue.zip"
 | 
			
		||||
PROMPT_VARS_MAX = 10
 | 
			
		||||
 | 
			
		||||
target_mmgp_version = "3.4.8"
 | 
			
		||||
WanGP_version = "5.41"
 | 
			
		||||
WanGP_version = "5.5"
 | 
			
		||||
prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
 | 
			
		||||
 | 
			
		||||
from importlib.metadata import version
 | 
			
		||||
@ -184,12 +185,18 @@ def process_prompt_and_add_tasks(state, model_choice):
 | 
			
		||||
        gr.Info("You must use the 14B model to generate videos with a resolution equivalent to 720P")
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    if "diffusion_forcing" in model_filename or "ltxv" in model_filename or "Vace" in model_filename:
 | 
			
		||||
    if "diffusion_forcing" in model_filename or "ltxv" in model_filename or "Vace" in model_filename or "hunyuan_video_custom_edit" in model_filename:
 | 
			
		||||
        video_length = inputs["video_length"]
 | 
			
		||||
        sliding_window_size = inputs["sliding_window_size"]
 | 
			
		||||
        if  video_length > sliding_window_size:
 | 
			
		||||
            gr.Info(f"The Number of Frames to generate ({video_length}) is greater than the Sliding Window Size ({sliding_window_size}) , multiple Windows will be generated")
 | 
			
		||||
 | 
			
		||||
    if "hunyuan_video_custom_edit" in model_filename:
 | 
			
		||||
        keep_frames_video_guide= inputs["keep_frames_video_guide"] 
 | 
			
		||||
        if len(keep_frames_video_guide) > 0: 
 | 
			
		||||
            gr.Info("Filtering Frames with this model is not supported")
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
    if "phantom" in model_filename or "hunyuan_video_custom" in model_filename or "hunyuan_video_avatar" in model_filename:
 | 
			
		||||
        image_refs = inputs["image_refs"]
 | 
			
		||||
        audio_guide  = inputs["audio_guide"]
 | 
			
		||||
@ -1552,6 +1559,8 @@ ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13
 | 
			
		||||
 | 
			
		||||
hunyuan_choices= ["ckpts/hunyuan_video_720_bf16.safetensors", "ckpts/hunyuan_video_720_quanto_int8.safetensors", "ckpts/hunyuan_video_i2v_720_bf16v2.safetensors", "ckpts/hunyuan_video_i2v_720_quanto_int8v2.safetensors",
 | 
			
		||||
                 "ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors",
 | 
			
		||||
                 "ckpts/hunyuan_video_custom_audio_720_bf16.safetensors", "ckpts/hunyuan_video_custom_audio_720_quanto_bf16_int8.safetensors",
 | 
			
		||||
                 "ckpts/hunyuan_video_custom_edit_720_bf16.safetensors", "ckpts/hunyuan_video_custom_edit_720_quanto_bf16_int8.safetensors",
 | 
			
		||||
                 "ckpts/hunyuan_video_avatar_720_bf16.safetensors", "ckpts/hunyuan_video_avatar_720_quanto_bf16_int8.safetensors",
 | 
			
		||||
                 ]
 | 
			
		||||
 | 
			
		||||
@ -1563,13 +1572,14 @@ def get_dependent_models(model_filename, quantization, dtype_policy ):
 | 
			
		||||
        return [get_model_filename("ltxv_13B", quantization, dtype_policy)]
 | 
			
		||||
    else:
 | 
			
		||||
        return []
 | 
			
		||||
model_types = [ "t2v_1.3B", "t2v", "i2v", "i2v_720p", "flf2v_720p", "vace_1.3B","vace_14B","moviigen", "phantom_1.3B", "phantom_14B", "fantasy",  "fun_inp_1.3B", "fun_inp", "recam_1.3B",  "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom", "hunyuan_avatar"]
 | 
			
		||||
model_types = [ "t2v_1.3B", "t2v", "i2v", "i2v_720p", "flf2v_720p", "vace_1.3B","vace_14B","moviigen", "phantom_1.3B", "phantom_14B", "fantasy",  "fun_inp_1.3B", "fun_inp", "recam_1.3B",  "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom", "hunyuan_custom_audio", "hunyuan_custom_edit", "hunyuan_avatar"]
 | 
			
		||||
model_signatures = {"t2v": "text2video_14B", "t2v_1.3B" : "text2video_1.3B",   "fun_inp_1.3B" : "Fun_InP_1.3B",  "fun_inp" :  "Fun_InP_14B", 
 | 
			
		||||
                    "i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "vace_14B" : "Vace_14B","recam_1.3B": "recammaster_1.3B", 
 | 
			
		||||
                    "flf2v_720p" : "FLF2V_720p", "sky_df_1.3B" : "sky_reels2_diffusion_forcing_1.3B", "sky_df_14B" : "sky_reels2_diffusion_forcing_14B", 
 | 
			
		||||
                    "sky_df_720p_14B" : "sky_reels2_diffusion_forcing_720p_14B",  "moviigen" :"moviigen",
 | 
			
		||||
                    "phantom_1.3B" : "phantom_1.3B", "phantom_14B" : "phantom_14B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled", 
 | 
			
		||||
                    "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom", "hunyuan_avatar" : "hunyuan_video_avatar"  }
 | 
			
		||||
                    "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom_720", "hunyuan_custom_audio" : "hunyuan_video_custom_audio", "hunyuan_custom_edit" : "hunyuan_video_custom_edit",
 | 
			
		||||
                    "hunyuan_avatar" : "hunyuan_video_avatar"  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_model_type(model_filename):
 | 
			
		||||
@ -1653,6 +1663,13 @@ def get_model_name(model_filename, description_container = [""]):
 | 
			
		||||
        model_name = "Hunyuan Video image2video 720p 13B"
 | 
			
		||||
        description = "A good looking image 2 video model, but not so good in prompt adherence."
 | 
			
		||||
    elif "hunyuan_video_custom" in model_filename:
 | 
			
		||||
        if "audio" in model_filename:
 | 
			
		||||
            model_name = "Hunyuan Video Custom Audio 720p 13B"
 | 
			
		||||
            description = "The Hunyuan Video Custom Audio model can be used to generate scenes of a person speaking given a Reference Image and a Recorded Voice or Song. The reference image is not a start image and therefore one can represent the person in a different context.The video length can be anything up to 10s. It is also quite good to generate no sound Video based on a person."
 | 
			
		||||
        elif "edit" in model_filename:
 | 
			
		||||
            model_name = "Hunyuan Video Custom Edit 720p 13B"
 | 
			
		||||
            description = "The Hunyuan Video Custom Edit model can be used to do Video inpainting on a person (add accessories or completely replace the person). You will need in any case to define a Video Mask which will indicate which area of the Video should be edited."
 | 
			
		||||
        else:
 | 
			
		||||
            model_name = "Hunyuan Video Custom 720p 13B"
 | 
			
		||||
            description = "The Hunyuan Video Custom model is probably the best model to transfer people (only people for the momment) as it is quite good to keep their identity. However it is slow as to get good results, you need to generate 720p videos with 30 steps."
 | 
			
		||||
    elif "hunyuan_video_avatar" in model_filename:
 | 
			
		||||
@ -1778,6 +1795,13 @@ def get_default_settings(filename):
 | 
			
		||||
                "flow_shift": 13,
 | 
			
		||||
                "resolution": "1280x720",
 | 
			
		||||
            })
 | 
			
		||||
        elif get_model_type(filename) in ("hunyuan_custom_edit"):
 | 
			
		||||
            ui_defaults.update({
 | 
			
		||||
                "guidance_scale": 7.5,
 | 
			
		||||
                "flow_shift": 13,
 | 
			
		||||
                "video_prompt_type": "MV",
 | 
			
		||||
                "sliding_window_size": 129,
 | 
			
		||||
            })
 | 
			
		||||
        elif get_model_type(filename) in ("hunyuan_avatar"):
 | 
			
		||||
            ui_defaults.update({
 | 
			
		||||
                "guidance_scale": 7.5,
 | 
			
		||||
@ -2570,19 +2594,96 @@ def convert_image(image):
 | 
			
		||||
    image = image.convert('RGB')
 | 
			
		||||
    return cast(Image, ImageOps.exif_transpose(image))
 | 
			
		||||
 | 
			
		||||
def get_resampled_video(video_in, start_frame, max_frames, target_fps):
 | 
			
		||||
def get_resampled_video(video_in, start_frame, max_frames, target_fps, bridge='torch'):
 | 
			
		||||
    from wan.utils.utils import resample
 | 
			
		||||
 | 
			
		||||
    import decord
 | 
			
		||||
    decord.bridge.set_bridge('torch')
 | 
			
		||||
    decord.bridge.set_bridge(bridge)
 | 
			
		||||
    reader = decord.VideoReader(video_in)
 | 
			
		||||
    
 | 
			
		||||
    fps = reader.get_avg_fps()
 | 
			
		||||
    fps = round(reader.get_avg_fps())
 | 
			
		||||
 | 
			
		||||
    frame_nos = resample(fps, len(reader), max_target_frames_count= max_frames, target_fps=target_fps, start_target_frame= start_frame)
 | 
			
		||||
    frames_list = reader.get_batch(frame_nos)
 | 
			
		||||
    # print(f"frame nos: {frame_nos}")
 | 
			
		||||
    return frames_list
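A short sketch of what the new `bridge` argument changes in practice ("video.mp4" is a placeholder path):

```python
import decord

# With the "torch" bridge (the previous hard-coded default) get_batch returns
# torch tensors; with the "native" bridge it returns decord NDArrays that
# need .asnumpy().
decord.bridge.set_bridge("torch")
frames = decord.VideoReader("video.mp4").get_batch([0, 1, 2])
print(type(frames))   # <class 'torch.Tensor'>, shape [N, H, W, 3], uint8

decord.bridge.set_bridge("native")
frames = decord.VideoReader("video.mp4").get_batch([0, 1, 2])
print(type(frames))   # decord.ndarray.NDArray; use frames.asnumpy()
```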
 | 
			
		||||
 | 
			
		||||
def preprocess_video_with_mask(input_video_path, input_mask_path, height, width,  max_frames, start_frame=0, fit_canvas = False, target_fps = 16, block_size= 16, expand_scale = 2, pose_enhance = True, to_bbox = False):
 | 
			
		||||
    if not input_video_path or not input_mask_path:
 | 
			
		||||
        return None, None
 | 
			
		||||
 | 
			
		||||
    from preprocessing.dwpose.pose import PoseBodyFaceVideoAnnotator
 | 
			
		||||
    cfg_dict = {
 | 
			
		||||
        "DETECTION_MODEL": "ckpts/pose/yolox_l.onnx",
 | 
			
		||||
        "POSE_MODEL": "ckpts/pose/dw-ll_ucoco_384.onnx",
 | 
			
		||||
        "RESIZE_SIZE": 1024
 | 
			
		||||
    }
 | 
			
		||||
    dwpose = PoseBodyFaceVideoAnnotator(cfg_dict)
 | 
			
		||||
 | 
			
		||||
    video = get_resampled_video(input_video_path, start_frame, max_frames, target_fps)
 | 
			
		||||
    mask_video = get_resampled_video(input_mask_path, start_frame, max_frames, target_fps)
 | 
			
		||||
 | 
			
		||||
    if len(video) == 0 or len(mask_video) == 0:
 | 
			
		||||
        return None, None
 | 
			
		||||
 | 
			
		||||
    frame_height, frame_width, _ = video[0].shape
 | 
			
		||||
 | 
			
		||||
    if fit_canvas :
 | 
			
		||||
        scale1  = min(height / frame_height, width /  frame_width)
 | 
			
		||||
        scale2  = min(height / frame_width, width /  frame_height)
 | 
			
		||||
        scale = max(scale1, scale2)
 | 
			
		||||
    else:
 | 
			
		||||
        scale =   ((height * width ) /  (frame_height * frame_width))**(1/2)
 | 
			
		||||
 | 
			
		||||
    height = (int(frame_height * scale) // block_size) * block_size
 | 
			
		||||
    width = (int(frame_width * scale) // block_size) * block_size
 | 
			
		||||
 | 
			
		||||
    num_frames = min(len(video), len(mask_video))
 | 
			
		||||
 | 
			
		||||
    masked_frames = []
 | 
			
		||||
    masks = []
 | 
			
		||||
    for frame_idx in range(num_frames):
 | 
			
		||||
        frame = Image.fromarray(video[frame_idx].cpu().numpy()) #.asnumpy()
 | 
			
		||||
        mask = Image.fromarray(mask_video[frame_idx].cpu().numpy()) #.asnumpy()
 | 
			
		||||
        frame = frame.resize((width, height), resample=Image.Resampling.LANCZOS) 
 | 
			
		||||
        mask = mask.resize((width, height), resample=Image.Resampling.LANCZOS) 
 | 
			
		||||
        frame = np.array(frame) 
 | 
			
		||||
        mask = np.array(mask)
 | 
			
		||||
 | 
			
		||||
        if len(mask.shape) == 3 and mask.shape[2] == 3:
 | 
			
		||||
            mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
 | 
			
		||||
 | 
			
		||||
        if expand_scale != 0:
 | 
			
		||||
            kernel_size = abs(expand_scale)
 | 
			
		||||
            kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))
 | 
			
		||||
            op_expand = cv2.dilate if expand_scale > 0 else cv2.erode
 | 
			
		||||
            mask = op_expand(mask, kernel, iterations=3)
 | 
			
		||||
 | 
			
		||||
        _, mask = cv2.threshold(mask, 127.5, 255, cv2.THRESH_BINARY)
 | 
			
		||||
        if to_bbox and np.sum(mask == 255) > 0:
 | 
			
		||||
            x0, y0, x1, y1 = mask_to_xyxy_box(mask)
 | 
			
		||||
            mask = mask * 0
 | 
			
		||||
            mask[y0:y1, x0:x1] = 255
 | 
			
		||||
 | 
			
		||||
        inverse_mask = mask == 0
 | 
			
		||||
        if pose_enhance:
 | 
			
		||||
            pose_img = dwpose.forward([frame])[0]
 | 
			
		||||
            masked_frame = np.where(inverse_mask[..., None], frame, pose_img) 
 | 
			
		||||
        else:
 | 
			
		||||
            masked_frame = frame * (inverse_mask[..., None].astype(frame.dtype))
 | 
			
		||||
 | 
			
		||||
        mask = torch.from_numpy(mask) # to be commented if save one video enabled
 | 
			
		||||
        masked_frame = torch.from_numpy(masked_frame) # to be commented if save one video debug enabled
 | 
			
		||||
        masks.append(mask)
 | 
			
		||||
        masked_frames.append(masked_frame)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    # from preprocessing.dwpose.pose import save_one_video
 | 
			
		||||
    # save_one_video("masked_frames.mp4", masked_frames, fps=target_fps, quality=8, macro_block_size=None)
 | 
			
		||||
    # save_one_video("masks.mp4", masks, fps=target_fps, quality=8, macro_block_size=None)
 | 
			
		||||
 | 
			
		||||
    return torch.stack(masked_frames), torch.stack(masks)
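`mask_to_xyxy_box` is called above but is not part of this hunk; a hypothetical sketch of such a helper, shown only to make the bounding-box branch readable:

```python
import numpy as np

def mask_to_xyxy_box(mask):
    # Bounding box of the 255-valued region as (x0, y0, x1, y1), so the
    # caller can rebuild a rectangular mask with mask[y0:y1, x0:x1] = 255.
    ys, xs = np.nonzero(mask == 255)
    return int(xs.min()), int(ys.min()), int(xs.max()) + 1, int(ys.max()) + 1

m = np.zeros((8, 8), dtype=np.uint8)
m[2:5, 3:6] = 255
print(mask_to_xyxy_box(m))  # (3, 2, 6, 5)
```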
 | 
			
		||||
 | 
			
		||||
def preprocess_video(process_type, height, width, video_in, max_frames, start_frame=0, fit_canvas = False, target_fps = 16, block_size = 16):
 | 
			
		||||
 | 
			
		||||
    frames_list = get_resampled_video(video_in, start_frame, max_frames, target_fps)
 | 
			
		||||
@ -2600,9 +2701,6 @@ def preprocess_video(process_type, height, width, video_in, max_frames, start_fr
 | 
			
		||||
 | 
			
		||||
    new_height = (int(frame_height * scale) // block_size) * block_size
 | 
			
		||||
    new_width = (int(frame_width * scale) // block_size) * block_size
 | 
			
		||||
    # if fit_canvas :
 | 
			
		||||
    #     new_height = height
 | 
			
		||||
    #     new_width = width
 | 
			
		||||
 | 
			
		||||
    processed_frames_list = []
 | 
			
		||||
    for frame in frames_list:
 | 
			
		||||
@ -2857,12 +2955,14 @@ def generate_video(
 | 
			
		||||
    hunyuan_t2v = "hunyuan_video_720" in model_filename
 | 
			
		||||
    hunyuan_i2v = "hunyuan_video_i2v" in model_filename
 | 
			
		||||
    hunyuan_custom = "hunyuan_video_custom" in model_filename
 | 
			
		||||
    hunyuan_custom_audio =  hunyuan_custom and "audio" in model_filename
 | 
			
		||||
    hunyuan_custom_edit =  hunyuan_custom and "edit" in model_filename
 | 
			
		||||
    hunyuan_avatar = "hunyuan_video_avatar" in model_filename
 | 
			
		||||
    fantasy = "fantasy" in model_filename
 | 
			
		||||
    if diffusion_forcing or hunyuan_t2v or hunyuan_i2v or hunyuan_custom:
 | 
			
		||||
        fps = 24
 | 
			
		||||
    elif hunyuan_avatar:
 | 
			
		||||
    if hunyuan_avatar or hunyuan_custom_audio:
 | 
			
		||||
        fps = 25
 | 
			
		||||
    elif diffusion_forcing or hunyuan_t2v or hunyuan_i2v or hunyuan_custom:
 | 
			
		||||
        fps = 24
 | 
			
		||||
    elif fantasy:
 | 
			
		||||
        fps = 23
 | 
			
		||||
    elif ltxv:
 | 
			
		||||
@ -2913,7 +3013,7 @@ def generate_video(
 | 
			
		||||
    audio_proj_split = None
 | 
			
		||||
    audio_scale = None
 | 
			
		||||
    audio_context_lens = None
 | 
			
		||||
    if (fantasy or hunyuan_avatar) and audio_guide != None:
 | 
			
		||||
    if (fantasy or hunyuan_avatar or hunyuan_custom_audio) and audio_guide != None:
 | 
			
		||||
        from fantasytalking.infer import parse_audio
 | 
			
		||||
        import librosa
 | 
			
		||||
        duration = librosa.get_duration(path=audio_guide)
 | 
			
		||||
@ -2922,6 +3022,12 @@ def generate_video(
 | 
			
		||||
            audio_proj_split, audio_context_lens = parse_audio(audio_guide, num_frames= current_video_length, fps= fps, device= processing_device  )
 | 
			
		||||
            audio_scale = 1.0
 | 
			
		||||
 | 
			
		||||
    if hunyuan_custom_edit and video_guide != None:
 | 
			
		||||
        import cv2
 | 
			
		||||
        cap = cv2.VideoCapture(video_guide)
 | 
			
		||||
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 | 
			
		||||
        current_video_length = min(current_video_length, length)
 | 
			
		||||
 | 
			
		||||
    import random
    if seed == None or seed <0:
        seed = random.randint(0, 999999999)
@ -2938,13 +3044,10 @@ def generate_video(
    repeat_no = 0
    extra_generation = 0
    initial_total_windows = 0
    if diffusion_forcing or vace or ltxv:
        reuse_frames = min(sliding_window_size - 4, sliding_window_overlap)
    else:
        reuse_frames = 0
    if (diffusion_forcing or ltxv) and source_video != None:
        current_video_length +=  sliding_window_overlap
    sliding_window = (vace or diffusion_forcing or ltxv) and current_video_length > sliding_window_size
    sliding_window = (vace or diffusion_forcing or ltxv or hunyuan_custom_edit) and current_video_length > sliding_window_size
    reuse_frames = min(sliding_window_size - 4, sliding_window_overlap) if sliding_window else 0

    discard_last_frames = sliding_window_discard_last_frames
    default_max_frames_to_generate = current_video_length
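A minimal sketch of the sliding-window arithmetic implied above: with `reuse_frames` overlapped between consecutive windows and optionally some frames discarded at the end of each window, estimate how many windows are needed to reach a target length. This is an illustration under those assumptions, not the generator's exact scheduler.

```python
import math

def estimate_windows(total_frames: int, window_size: int, overlap: int,
                     discard_last: int = 0) -> int:
    reuse = min(window_size - 4, overlap)                # mirrors the reuse_frames formula
    net_per_extra_window = window_size - reuse - discard_last
    if total_frames <= window_size or net_per_extra_window <= 0:
        return 1
    return 1 + math.ceil((total_frames - window_size) / net_per_extra_window)

print(estimate_windows(total_frames=193, window_size=81, overlap=5, discard_last=8))  # 3
```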
@ -3094,6 +3197,10 @@ def generate_video(
                                                                        pre_src_video = [pre_video_guide],
                                                                        fit_into_canvas = fit_canvas 
                                                                        )
            elif hunyuan_custom_edit:
                progress_args = [0, get_latest_status(state,"Extracting Open Pose Information and Expanding Mask")]
                send_cmd("progress", progress_args)
                src_video, src_mask = preprocess_video_with_mask(video_guide,  video_mask, height=height, width = width, max_frames= current_video_length if window_no == 1 else current_video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas = fit_canvas, target_fps = fps, pose_enhance = "P" in  video_prompt_type)
            if window_no ==  1:                
                conditioning_latents_size = ( (prefix_video_frames_count-1) // latent_size) + 1 if prefix_video_frames_count > 0 else 0
            else:
@ -3124,7 +3231,7 @@ def generate_video(
                    input_frames = src_video,
                    input_ref_images=  src_ref_images,
                    input_masks = src_mask,
                    input_video= pre_video_guide  if diffusion_forcing or ltxv else source_video,
                    input_video= pre_video_guide  if diffusion_forcing or ltxv or hunyuan_custom_edit else source_video,
                    target_camera= target_camera,
                    frame_num=(current_video_length // latent_size)* latent_size + 1,
                    height =  height,
@ -3693,12 +3800,12 @@ def process_tasks(state):

def get_generation_status(prompt_no, prompts_max, repeat_no, repeat_max, window_no, total_windows):
    if prompts_max == 1:
        if repeat_max == 1:
        if repeat_max <= 1:
            status = ""
        else:
            status = f"Sample {repeat_no}/{repeat_max}"
    else:
        if repeat_max == 1:
        if repeat_max <= 1:
            status = f"Prompt {prompt_no}/{prompts_max}"
        else:
            status = f"Prompt {prompt_no}/{prompts_max}, Sample {repeat_no}/{repeat_max}"
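A minimal sketch of the branching above (the rest of the function, which also folds in window numbers, lies outside this hunk). The switch to `<= 1` means a `repeat_max` of 0 is treated like a single sample instead of producing a "Sample 1/0" label.

```python
def status_prefix(prompt_no: int, prompts_max: int, repeat_no: int, repeat_max: int) -> str:
    if prompts_max == 1:
        return "" if repeat_max <= 1 else f"Sample {repeat_no}/{repeat_max}"
    if repeat_max <= 1:
        return f"Prompt {prompt_no}/{prompts_max}"
    return f"Prompt {prompt_no}/{prompts_max}, Sample {repeat_no}/{repeat_max}"

print(status_prefix(1, 1, 1, 0))   # "" (previously would have shown "Sample 1/0")
print(status_prefix(2, 3, 1, 4))   # "Prompt 2/3, Sample 1/4"
```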
@ -4111,7 +4218,7 @@ def prepare_inputs_dict(target, inputs ):
            inputs.pop(k)


    if not "Vace" in model_filename and not "diffusion_forcing" in model_filename and not "ltxv" in model_filename:
    if not "Vace" in model_filename and not "diffusion_forcing" in model_filename and not "ltxv" in model_filename and not "hunyuan_custom_edit" in model_filename:
        unsaved_params = [ "sliding_window_size", "sliding_window_overlap", "sliding_window_overlap_noise", "sliding_window_discard_last_frames"]
        for k in unsaved_params:
            inputs.pop(k)
@ -4556,8 +4663,10 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
            hunyuan_t2v = "hunyuan_video_720" in model_filename
            hunyuan_i2v = "hunyuan_video_i2v" in model_filename
            hunyuan_video_custom = "hunyuan_video_custom" in model_filename
            hunyuan_video_custom_audio = hunyuan_video_custom  and "audio" in model_filename
            hunyuan_video_custom_edit = hunyuan_video_custom  and "edit" in model_filename
            hunyuan_video_avatar = "hunyuan_video_avatar" in model_filename
            sliding_window_enabled = vace or diffusion_forcing or ltxv
            sliding_window_enabled = vace or diffusion_forcing or ltxv or hunyuan_video_custom_edit
            new_line_text = "each new line of prompt will be used for a window" if sliding_window_enabled else "each new line of prompt will generate a new video"

            with gr.Column(visible= test_class_i2v(model_filename) or diffusion_forcing or ltxv or recammaster) as image_prompt_column: 
@ -4630,7 +4739,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                    model_mode = gr.Dropdown(value=None, visible=False)
                    keep_frames_video_source = gr.Text(visible=False)

            with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar) as video_prompt_column: 
            with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or hunyuan_video_custom_edit ) as video_prompt_column: 
                video_prompt_type_value= ui_defaults.get("video_prompt_type","")
                video_prompt_type = gr.Text(value= video_prompt_type_value, visible= False)
                with gr.Row():
@ -4648,6 +4757,15 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                            value=filter_letters(video_prompt_type_value, "ODPCMV"),
                            label="Video to Video", scale = 3, visible= True
                        )
                    elif hunyuan_video_custom_edit:
                        video_prompt_type_video_guide = gr.Dropdown(
                            choices=[
                                ("Inpaint Control Video in area defined by Mask", "MV"),
                                ("Inpaint and Transfer Human Motion from the Control Video in area defined by Mask", "PMV"),
                            ],
                            value=filter_letters(video_prompt_type_value, "ODPCMV"),
                            label="Video to Video", scale = 3, visible= True
                        )
                    else:
                        video_prompt_type_video_guide = gr.Dropdown(visible= False)

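The dropdown values above are built with the repo's `filter_letters` helper. A plausible minimal re-implementation (an assumption, not the actual code) would keep only the characters of the stored prompt-type string that belong to the allowed letter set, preserving their order:

```python
# Hypothetical re-implementation of filter_letters for illustration only.
def filter_letters(value: str, allowed: str) -> str:
    return "".join(ch for ch in value if ch in allowed)

# Example: a stored video_prompt_type of "PMVI" reduced to the letters the
# "Video to Video" dropdown understands.
print(filter_letters("PMVI", "ODPCMV"))  # "PMV"
```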
@ -4686,7 +4804,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non


                video_mask = gr.Video(label= "Video Mask (for Inpainting or Outpaing, white pixels = Mask)", visible= "M" in video_prompt_type_value, value= ui_defaults.get("video_mask", None)) 
            audio_guide = gr.Audio(value= ui_defaults.get("audio_guide", None), type="filepath", label="Voice to follow", show_download_button= True, visible= fantasy or hunyuan_video_avatar )
            audio_guide = gr.Audio(value= ui_defaults.get("audio_guide", None), type="filepath", label="Voice to follow", show_download_button= True, visible= fantasy or hunyuan_video_avatar or hunyuan_video_custom_audio   )

            advanced_prompt = advanced_ui
            prompt_vars=[]
@ -4775,9 +4893,9 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                    video_length = gr.Slider(17, 737, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (16 = 1s)", interactive= True)
                elif fantasy:
                    video_length = gr.Slider(5, 233, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (23 = 1s)", interactive= True)
                elif hunyuan_video_avatar:
                elif hunyuan_video_avatar or hunyuan_video_custom_audio:
                    video_length = gr.Slider(5, 401, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (25 = 1s)", interactive= True)
                elif hunyuan_t2v or hunyuan_i2v:
                elif hunyuan_t2v or hunyuan_i2v or hunyuan_video_custom:
                    video_length = gr.Slider(5, 337, value=ui_defaults.get("video_length", 97), step=4, label="Number of frames (24 = 1s)", interactive= True)
                else:
                    video_length = gr.Slider(5, 193, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (16 = 1s)", interactive= True)
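A minimal sketch of the frames-per-second bookkeeping behind the slider labels above ("16 = 1s", "24 = 1s", "25 = 1s"): convert a target duration to a frame count that lands on the sliders' 4-frame grid, since values like 81, 97 and 401 are all of the form 4*n + 1. The snapping rule is an assumption for illustration.

```python
def frames_for_duration(seconds: float, fps: int) -> int:
    raw = int(seconds * fps)
    return max(5, (raw // 4) * 4 + 1)   # snap to 4*n + 1, never below 5 frames

print(frames_for_duration(5, 16))   # 81  -> ~5 s for Wan models
print(frames_for_duration(4, 24))   # 97  -> ~4 s for Hunyuan t2v/i2v/custom
print(frames_for_duration(16, 25))  # 401 -> ~16 s for Avatar / Custom Audio
```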
@ -4915,17 +5033,22 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                        if diffusion_forcing:
                            sliding_window_size = gr.Slider(37, 137, value=ui_defaults.get("sliding_window_size", 97), step=20, label="Sliding Window Size (recommended to keep it at 97)")
                            sliding_window_overlap = gr.Slider(17, 97, value=ui_defaults.get("sliding_window_overlap",17), step=20, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
                            sliding_window_overlap_noise = gr.Slider(0, 100, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect")
                            sliding_window_overlap_noise = gr.Slider(0, 100, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect", visible = True)
                            sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, visible = False)
                        elif ltxv:
                            sliding_window_size = gr.Slider(41, 257, value=ui_defaults.get("sliding_window_size", 129), step=8, label="Sliding Window Size")
                            sliding_window_overlap = gr.Slider(9, 97, value=ui_defaults.get("sliding_window_overlap",9), step=8, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
                            sliding_window_overlap_noise = gr.Slider(0, 100, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect")
                            sliding_window_overlap_noise = gr.Slider(0, 100, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect", visible = False)
                            sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, visible = False)
                        elif hunyuan_video_custom_edit:
                            sliding_window_size = gr.Slider(5, 257, value=ui_defaults.get("sliding_window_size", 129), step=4, label="Sliding Window Size")
                            sliding_window_overlap = gr.Slider(1, 97, value=ui_defaults.get("sliding_window_overlap",5), step=4, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
                            sliding_window_overlap_noise = gr.Slider(0, 150, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect", visible = False)
                            sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, label="Discard Last Frames of a Window (that may have bad quality)", visible = True)
                        else:
                            sliding_window_size = gr.Slider(5, 137, value=ui_defaults.get("sliding_window_size", 81), step=4, label="Sliding Window Size")
                            sliding_window_overlap = gr.Slider(1, 97, value=ui_defaults.get("sliding_window_overlap",5), step=4, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
                            sliding_window_overlap_noise = gr.Slider(0, 150, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect")
                            sliding_window_overlap_noise = gr.Slider(0, 150, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect" , visible = True)
                            sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 8), step=4, label="Discard Last Frames of a Window (that may have bad quality)", visible = True)