From 90b52d86ffdb8a277c5d3e9152e60b4a756e3059 Mon Sep 17 00:00:00 2001
From: DeepBeepMeep
Date: Sat, 17 May 2025 16:51:20 +0200
Subject: [PATCH] more fixes

---
 hyvideo/text_encoder/__init__.py      |  4 ++--
 ltx_video/ltxv.py                     | 16 +++++++++++-----
 ltx_video/pipelines/crf_compressor.py |  2 +-
 requirements.txt                      |  1 +
 wan/text2video.py                     |  6 +++---
 wgp.py                                | 14 ++++++++------
 6 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/hyvideo/text_encoder/__init__.py b/hyvideo/text_encoder/__init__.py
index 46af6e3..a51aa3f 100644
--- a/hyvideo/text_encoder/__init__.py
+++ b/hyvideo/text_encoder/__init__.py
@@ -189,9 +189,9 @@ class TextEncoder(nn.Module):
         if "llm" in text_encoder_type:
             from mmgp import offload
             forcedConfigPath= None if "i2v" in text_encoder_type else "ckpts/llava-llama-3-8b/config.json"
-            self.model= offload.fast_load_transformers_model(self.model_path, forcedConfigPath=forcedConfigPath, modelPrefix= "model" if forcedConfigPath !=None else None)
+            self.model= offload.fast_load_transformers_model(self.model_path, forcedConfigPath=forcedConfigPath)
             if forcedConfigPath != None:
-                self.model.final_layer_norm = self.model.norm
+                self.model.final_layer_norm = self.model.model.norm

         else:
             self.model, self.model_path = load_text_encoder(
diff --git a/ltx_video/ltxv.py b/ltx_video/ltxv.py
index e82c004..aac1d31 100644
--- a/ltx_video/ltxv.py
+++ b/ltx_video/ltxv.py
@@ -155,14 +155,14 @@ class LTXV:
     ):
         self.mixed_precision_transformer = mixed_precision_transformer
-        # ckpt_path = Path(ckpt_path)
+        self.distilled = "distilled" in model_filepath[0]
         # with safe_open(ckpt_path, framework="pt") as f:
         #     metadata = f.metadata()
         #     config_str = metadata.get("config")
         #     configs = json.loads(config_str)
         #     allowed_inference_steps = configs.get("allowed_inference_steps", None)
         # transformer = Transformer3DModel.from_pretrained(ckpt_path)
-        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", config_file_path="config_transformer.json")
+        # transformer = offload.fast_load_transformers_model("c:/temp/ltxdistilled/diffusion_pytorch_model-00001-of-00006.safetensors", forcedConfigPath="c:/temp/ltxdistilled/config.json")

         # vae = CausalVideoAutoencoder.from_pretrained(ckpt_path)
         vae = offload.fast_load_transformers_model("ckpts/ltxv_0.9.7_VAE.safetensors", modelClass=CausalVideoAutoencoder)

@@ -174,8 +174,11 @@ class LTXV:
         # vae = offload.fast_load_transformers_model("vae.safetensors", modelClass=CausalVideoAutoencoder, modelPrefix= "vae", forcedConfigPath="config_vae.json")
         # offload.save_model(vae, "vae.safetensors", config_file_path="config_vae.json")

-
-        transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel)
+        # model_filepath = "c:/temp/ltxd/ltxv-13b-0.9.7-distilled.safetensors"
+        # transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel, forcedConfigPath= "c:/temp/ltxd/config.json")
+        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors", config_file_path= "c:/temp/ltxd/config.json")
+        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_distilled_quanto_bf16_int8.safetensors", do_quantize= True, config_file_path="c:/temp/ltxd/config.json")
+        transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel)
         transformer._model_dtype = dtype
         if mixed_precision_transformer:
             transformer._lock_dtype = torch.float
@@ -295,7 +298,10 @@ class LTXV:
             conditioning_media_paths = None
             conditioning_start_frames = None

-        pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml"
"ltx_video/configs/ltxv-13b-0.9.7-dev.yaml" + if self.distilled : + pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml" + else: + pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml" # check if pipeline_config is a file if not os.path.isfile(pipeline_config): raise ValueError(f"Pipeline config file {pipeline_config} does not exist") diff --git a/ltx_video/pipelines/crf_compressor.py b/ltx_video/pipelines/crf_compressor.py index 416c3f7..9b9380a 100644 --- a/ltx_video/pipelines/crf_compressor.py +++ b/ltx_video/pipelines/crf_compressor.py @@ -1,4 +1,4 @@ -# import av +import av import torch import io import numpy as np diff --git a/requirements.txt b/requirements.txt index deeaa58..faf0745 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,4 +32,5 @@ hydra-core librosa loguru sentencepiece +av # rembg==2.0.65 diff --git a/wan/text2video.py b/wan/text2video.py index b2c30aa..9f7564f 100644 --- a/wan/text2video.py +++ b/wan/text2video.py @@ -80,9 +80,9 @@ class WanT2V: logging.info(f"Creating WanModel from {model_filename[-1]}") from mmgp import offload - # model_filename = "c:/temp/vace/diffusion_pytorch_model-00001-of-00007.safetensors" + # model_filename = "c:/temp/vace1.3/diffusion_pytorch_model.safetensors" # model_filename = "vace14B_quanto_bf16_int8.safetensors" - self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer, writable_tensors= False) # , forcedConfigPath= "c:/temp/vace/vace_config.json") + self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer, writable_tensors= False , forcedConfigPath= "c:/temp/vace1.3/config.json") # offload.load_model_data(self.model, "e:/vace.safetensors") # offload.load_model_data(self.model, "c:/temp/Phantom-Wan-1.3B.pth") # self.model.to(torch.bfloat16) @@ -90,7 +90,7 @@ class WanT2V: self.model.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype) # dtype = torch.bfloat16 offload.change_dtype(self.model, dtype, True) - # offload.save_model(self.model, "vace14B_bf16.safetensors", config_file_path="c:/temp/vace/vace_config.json") + # offload.save_model(self.model, "wan2.1_Vace1.3B_mbf16.safetensors", config_file_path="c:/temp/vace1.3/config.json") # offload.save_model(self.model, "vace14B_quanto_fp16_int8.safetensors", do_quantize= True, config_file_path="c:/temp/vace/vace_config.json") self.model.eval().requires_grad_(False) diff --git a/wgp.py b/wgp.py index 81abdd5..93f26b8 100644 --- a/wgp.py +++ b/wgp.py @@ -1528,7 +1528,7 @@ wan_choices_i2v=["ckpts/wan2.1_image2video_480p_14B_mbf16.safetensors", "ckpts/w "ckpts/wan2.1_image2video_720p_14B_quanto_mbf16_int8.safetensors", "ckpts/wan2.1_Fun_InP_1.3B_bf16.safetensors", "ckpts/wan2.1_Fun_InP_14B_bf16.safetensors", "ckpts/wan2.1_Fun_InP_14B_quanto_int8.safetensors", "ckpts/wan2.1_FLF2V_720p_14B_bf16.safetensors", "ckpts/wan2.1_FLF2V_720p_14B_quanto_int8.safetensors", "ckpts/wan2.1_fantasy_speaking_14B_bf16.safetensors"] -ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors"] +ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_quanto_bf16_int8.safetensors"] hunyuan_choices= ["ckpts/hunyuan_video_720_bf16.safetensors", "ckpts/hunyuan_video_720_quanto_int8.safetensors", "ckpts/hunyuan_video_i2v_720_bf16.safetensors", 
"ckpts/hunyuan_video_i2v_720_quanto_int8v2.safetensors", "ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors" ] @@ -1539,12 +1539,12 @@ def get_dependent_models(model_filename, quantization, dtype_policy ): return [get_model_filename("i2v_720p", quantization, dtype_policy)] else: return [] -model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "vace_14B", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy", "ltxv_13B", "hunyuan", "hunyuan_i2v", "hunyuan_custom"] +model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "vace_14B", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom"] model_signatures = {"t2v": "text2video_14B", "t2v_1.3B" : "text2video_1.3B", "fun_inp_1.3B" : "Fun_InP_1.3B", "fun_inp" : "Fun_InP_14B", "i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "vace_14B" : "Vace_14B","recam_1.3B": "recammaster_1.3B", "flf2v_720p" : "FLF2V_720p", "sky_df_1.3B" : "sky_reels2_diffusion_forcing_1.3B", "sky_df_14B" : "sky_reels2_diffusion_forcing_14B", "sky_df_720p_14B" : "sky_reels2_diffusion_forcing_720p_14B", - "phantom_1.3B" : "phantom_1.3B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" } + "phantom_1.3B" : "phantom_1.3B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" } def get_model_type(model_filename): @@ -1606,10 +1606,12 @@ def get_model_name(model_filename, description_container = [""]): model_name = "Wan2.1 Fantasy Speaking 720p" model_name += " 14B" if "14B" in model_filename else " 1.3B" description = "The Fantasy Speaking model corresponds to the original Wan image 2 video model combined with the Fantasy Speaking extension to process an audio Input." - elif "ltxv" in model_filename: - model_name = "LTX Video" - model_name += " 0.9.7 13B" if "13B" in model_filename else " 0.9.6 2B" + elif "ltxv_0.9.7_13B_dev" in model_filename: + model_name = "LTX Video 0.9.7" description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).It is recommended to keep the number of steps to 30 or you will need to update the file 'ltxv_video/configs/ltxv-13b-0.9.7-dev.yaml'.The LTX Video model expects very long prompt, so don't hesitate to use the Prompt Enhancer." + elif "ltxv_0.9.7_13B_distilled" in model_filename: + model_name = "LTX Video 0.9.7 distilled" + description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).This is the distilled / fast version. The LTX Video model expects very long prompt, so don't hesitate to use the Prompt Enhancer." elif "hunyuan_video_720" in model_filename: model_name = "Hunyuan Video text2video 720p" description = "Probably the best text 2 video model available."