Mirror of https://github.com/Wan-Video/Wan2.1.git, synced 2025-11-04 22:26:36 +00:00

Commit: 90b52d86ff ("more fixes")
Parent: 6d9e60b309
@@ -189,9 +189,9 @@ class TextEncoder(nn.Module):
         if "llm" in text_encoder_type:
             from mmgp import offload
             forcedConfigPath= None if "i2v" in text_encoder_type else "ckpts/llava-llama-3-8b/config.json"
-            self.model= offload.fast_load_transformers_model(self.model_path, forcedConfigPath=forcedConfigPath, modelPrefix= "model" if forcedConfigPath !=None else None)
+            self.model= offload.fast_load_transformers_model(self.model_path, forcedConfigPath=forcedConfigPath)
             if forcedConfigPath != None:
-                self.model.final_layer_norm = self.model.norm
+                self.model.final_layer_norm = self.model.model.norm

         else:
             self.model, self.model_path = load_text_encoder(
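Note: with `modelPrefix="model"` no longer passed, the loaded module keeps its outer wrapper, so the final norm now sits one attribute deeper, hence `self.model.model.norm`. A minimal sketch of that aliasing with stand-in modules (hypothetical class names, not the real LLM classes):

    import torch.nn as nn

    class InnerLM(nn.Module):           # stand-in for the wrapped language model
        def __init__(self):
            super().__init__()
            self.norm = nn.LayerNorm(8)

    class LoadedWrapper(nn.Module):     # stand-in for what the loader returns without modelPrefix
        def __init__(self):
            super().__init__()
            self.model = InnerLM()

    m = LoadedWrapper()
    m.final_layer_norm = m.model.norm   # same aliasing as the fix above
    assert m.final_layer_norm is m.model.norm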
@@ -155,14 +155,14 @@ class LTXV:
     ):

         self.mixed_precision_transformer = mixed_precision_transformer
-        # ckpt_path = Path(ckpt_path)
+        self.distilled = "distilled" in model_filepath[0]
         # with safe_open(ckpt_path, framework="pt") as f:
         # metadata = f.metadata()
         # config_str = metadata.get("config")
         # configs = json.loads(config_str)
         # allowed_inference_steps = configs.get("allowed_inference_steps", None)
         # transformer = Transformer3DModel.from_pretrained(ckpt_path)
-        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", config_file_path="config_transformer.json")
+        # transformer = offload.fast_load_transformers_model("c:/temp/ltxdistilled/diffusion_pytorch_model-00001-of-00006.safetensors", forcedConfigPath="c:/temp/ltxdistilled/config.json")

         # vae = CausalVideoAutoencoder.from_pretrained(ckpt_path)
         vae = offload.fast_load_transformers_model("ckpts/ltxv_0.9.7_VAE.safetensors", modelClass=CausalVideoAutoencoder)
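Note: the new `self.distilled` flag is derived from the first checkpoint filename. A quick sketch of the check in isolation, using filenames that appear later in this commit:

    def is_distilled(model_filepath: list) -> bool:
        # mirrors `self.distilled = "distilled" in model_filepath[0]`
        return "distilled" in model_filepath[0]

    assert is_distilled(["ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors"])
    assert not is_distilled(["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors"])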
@@ -174,8 +174,11 @@ class LTXV:
         # vae = offload.fast_load_transformers_model("vae.safetensors", modelClass=CausalVideoAutoencoder, modelPrefix= "vae", forcedConfigPath="config_vae.json")
         # offload.save_model(vae, "vae.safetensors", config_file_path="config_vae.json")

-        transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel)
+        # model_filepath = "c:/temp/ltxd/ltxv-13b-0.9.7-distilled.safetensors"
+        transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel, forcedConfigPath= "c:/temp/ltxd/config.json")
+        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors", config_file_path= "c:/temp/ltxd/config.json")
+        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_distilled_quanto_bf16_int8.safetensors", do_quantize= True, config_file_path="c:/temp/ltxd/config.json")
+        # transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel)
         transformer._model_dtype = dtype
         if mixed_precision_transformer:
             transformer._lock_dtype = torch.float
@@ -295,6 +298,9 @@ class LTXV:
         conditioning_media_paths = None
         conditioning_start_frames = None

-        pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml"
+        if self.distilled :
+            pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml"
+        else:
+            pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml"
         # check if pipeline_config is a file
         if not os.path.isfile(pipeline_config):
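Note: this is where the constructor flag pays off; the distilled variant ships with its own pipeline YAML, so the config is switched per variant. The same branch condensed into a helper (hypothetical name, for illustration only):

    def pick_pipeline_config(distilled: bool) -> str:
        return ("ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml" if distilled
                else "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml")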
@@ -1,4 +1,4 @@
-# import av
+import av
 import torch
 import io
 import numpy as np
@@ -32,4 +32,5 @@ hydra-core
 librosa
 loguru
 sentencepiece
+av
 # rembg==2.0.65
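Note: with `# import av` uncommented in the hunk above, PyAV becomes a hard dependency, and this hunk adds it to requirements.txt to match. If the import were meant to stay optional, a guarded form like this sketch would be the alternative:

    try:
        import av  # PyAV, now listed in requirements.txt
    except ImportError as exc:
        raise ImportError("PyAV is required; install it with `pip install av`") from exc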
@@ -80,9 +80,9 @@ class WanT2V:

         logging.info(f"Creating WanModel from {model_filename[-1]}")
         from mmgp import offload
-        # model_filename = "c:/temp/vace/diffusion_pytorch_model-00001-of-00007.safetensors"
+        # model_filename = "c:/temp/vace1.3/diffusion_pytorch_model.safetensors"
         # model_filename = "vace14B_quanto_bf16_int8.safetensors"
-        self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer, writable_tensors= False) # , forcedConfigPath= "c:/temp/vace/vace_config.json")
+        self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer, writable_tensors= False , forcedConfigPath= "c:/temp/vace1.3/config.json")
         # offload.load_model_data(self.model, "e:/vace.safetensors")
         # offload.load_model_data(self.model, "c:/temp/Phantom-Wan-1.3B.pth")
         # self.model.to(torch.bfloat16)
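Note: `forcedConfigPath` appears to point `fast_load_transformers_model` at an explicit config JSON rather than one bundled with the checkpoint; the `c:/temp/vace1.3/config.json` value here is a local development path. A defensive sketch (an assumption, not what the committed line does, which passes the path unconditionally):

    import os

    def config_override(path):
        # only force a local config when it actually exists on this machine
        return path if os.path.isfile(path) else None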
@@ -90,7 +90,7 @@ class WanT2V:
         self.model.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype)
         # dtype = torch.bfloat16
         offload.change_dtype(self.model, dtype, True)
-        # offload.save_model(self.model, "vace14B_bf16.safetensors", config_file_path="c:/temp/vace/vace_config.json")
+        # offload.save_model(self.model, "wan2.1_Vace1.3B_mbf16.safetensors", config_file_path="c:/temp/vace1.3/config.json")
         # offload.save_model(self.model, "vace14B_quanto_fp16_int8.safetensors", do_quantize= True, config_file_path="c:/temp/vace/vace_config.json")
         self.model.eval().requires_grad_(False)

wgp.py (14 changed lines)

@@ -1528,7 +1528,7 @@ wan_choices_i2v=["ckpts/wan2.1_image2video_480p_14B_mbf16.safetensors", "ckpts/w
     "ckpts/wan2.1_image2video_720p_14B_quanto_mbf16_int8.safetensors", "ckpts/wan2.1_Fun_InP_1.3B_bf16.safetensors", "ckpts/wan2.1_Fun_InP_14B_bf16.safetensors",
     "ckpts/wan2.1_Fun_InP_14B_quanto_int8.safetensors", "ckpts/wan2.1_FLF2V_720p_14B_bf16.safetensors", "ckpts/wan2.1_FLF2V_720p_14B_quanto_int8.safetensors",
     "ckpts/wan2.1_fantasy_speaking_14B_bf16.safetensors"]
-ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors"]
+ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_quanto_bf16_int8.safetensors"]

 hunyuan_choices= ["ckpts/hunyuan_video_720_bf16.safetensors", "ckpts/hunyuan_video_720_quanto_int8.safetensors", "ckpts/hunyuan_video_i2v_720_bf16.safetensors", "ckpts/hunyuan_video_i2v_720_quanto_int8v2.safetensors",
     "ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors" ]
@@ -1539,12 +1539,12 @@ def get_dependent_models(model_filename, quantization, dtype_policy ):
         return [get_model_filename("i2v_720p", quantization, dtype_policy)]
     else:
         return []
-model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "vace_14B", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy", "ltxv_13B", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
+model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "vace_14B", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
 model_signatures = {"t2v": "text2video_14B", "t2v_1.3B" : "text2video_1.3B", "fun_inp_1.3B" : "Fun_InP_1.3B", "fun_inp" : "Fun_InP_14B",
     "i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "vace_14B" : "Vace_14B","recam_1.3B": "recammaster_1.3B",
     "flf2v_720p" : "FLF2V_720p", "sky_df_1.3B" : "sky_reels2_diffusion_forcing_1.3B", "sky_df_14B" : "sky_reels2_diffusion_forcing_14B",
     "sky_df_720p_14B" : "sky_reels2_diffusion_forcing_720p_14B",
-    "phantom_1.3B" : "phantom_1.3B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" }
+    "phantom_1.3B" : "phantom_1.3B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" }


 def get_model_type(model_filename):
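Note: each entry in `model_types` needs a matching substring in `model_signatures`, which presumably resolves a checkpoint filename back to a model type. A sketch of that lookup under that assumption (`get_model_type` itself is not shown in this diff):

    model_signatures = {"ltxv_13B": "ltxv_0.9.7_13B_dev",
                        "ltxv_13B_distilled": "ltxv_0.9.7_13B_distilled"}

    def get_model_type(model_filename):
        for model_type, signature in model_signatures.items():
            if signature in model_filename:
                return model_type
        raise Exception("Unknown model type for " + model_filename)

    assert get_model_type("ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors") == "ltxv_13B_distilled"

Renaming the dev signature from "ltxv_0.9.7_13B" to "ltxv_0.9.7_13B_dev" matters here: the old value was a prefix of the distilled filenames, so both checkpoint variants would have matched "ltxv_13B".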
@@ -1606,10 +1606,12 @@ def get_model_name(model_filename, description_container = [""]):
         model_name = "Wan2.1 Fantasy Speaking 720p"
         model_name += " 14B" if "14B" in model_filename else " 1.3B"
         description = "The Fantasy Speaking model corresponds to the original Wan image 2 video model combined with the Fantasy Speaking extension to process an audio Input."
-    elif "ltxv" in model_filename:
-        model_name = "LTX Video"
-        model_name += " 0.9.7 13B" if "13B" in model_filename else " 0.9.6 2B"
+    elif "ltxv_0.9.7_13B_dev" in model_filename:
+        model_name = "LTX Video 0.9.7"
         description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).It is recommended to keep the number of steps to 30 or you will need to update the file 'ltxv_video/configs/ltxv-13b-0.9.7-dev.yaml'.The LTX Video model expects very long prompt, so don't hesitate to use the Prompt Enhancer."
+    elif "ltxv_0.9.7_13B_distilled" in model_filename:
+        model_name = "LTX Video 0.9.7 distilled"
+        description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).This is the distilled / fast version. The LTX Video model expects very long prompt, so don't hesitate to use the Prompt Enhancer."
     elif "hunyuan_video_720" in model_filename:
         model_name = "Hunyuan Video text2video 720p"
         description = "Probably the best text 2 video model available."
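Note: the display-name branches now key on the full dev/distilled signatures, matching the new `ltxv_choices` filenames above. A quick check of the branching logic (sketch):

    for name, expected in [
        ("ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "LTX Video 0.9.7"),
        ("ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors", "LTX Video 0.9.7 distilled"),
    ]:
        label = ("LTX Video 0.9.7 distilled" if "ltxv_0.9.7_13B_distilled" in name
                 else "LTX Video 0.9.7" if "ltxv_0.9.7_13B_dev" in name
                 else "?")
        assert label == expected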