mirror of https://github.com/Wan-Video/Wan2.1.git (synced 2025-12-15 11:43:21 +00:00)

the RAM Liberator

This commit is contained in:
parent dbe744208b
commit e2273ef726
1
.gitignore
vendored
@ -36,6 +36,7 @@ Wan2.1-T2V-1.3B/
Wan2.1-I2V-14B-480P/
Wan2.1-I2V-14B-720P/
outputs/
outputs2/
gradio_outputs/
ckpts/
loras/
@ -20,6 +20,12 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models
**Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep

## 🔥 Latest Updates :

### August 24 2025: WanGP v8.1 - the RAM Liberator

- Reserved RAM is now entirely freed when switching models, so you should get far fewer RAM-related out-of-memory errors. I have also added a button in *Configuration / Performance* that will release most of the RAM used by WanGP if you want to use another application without quitting WanGP.
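  The mechanics of this are visible in the `wgp.py` part of this commit (`release_model()`): drop the references to the loaded model, release the mmgp offload object, then ask PyTorch to hand its cached memory back. A minimal sketch of the same idea (the function name and structure here are illustrative; the PyTorch and mmgp calls are the ones the commit uses):

  ```python
  import gc
  import torch

  def release_reserved_ram(offload_obj=None):
      """Free as much RAM/VRAM as possible without quitting the app.

      Assumes the caller has already dropped its own references to the model
      (wgp.py does this by setting the global `wan_model` to None).
      """
      if offload_obj is not None:
          offload_obj.release()        # release the mmgp offload object used by WanGP
      torch.cuda.empty_cache()         # return cached VRAM blocks to the driver
      gc.collect()                     # reclaim Python-side objects
      try:
          torch._C._host_emptyCache()  # release reserved host RAM (newer PyTorch builds)
      except AttributeError:
          pass                         # older PyTorch builds do not expose this hook
  ```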
- InfiniteTalk support: an improved version of Multitalk that is meant to support very long audio-driven video generations. It comes in two flavors (*Single Speaker* and *Multi Speakers*) but does not seem to be compatible with Vace. One key new feature compared to Multitalk is that you can attach different visual shots to the same audio track: each Reference Frame you provide is associated with a new Sliding Window. If only one Reference Frame is provided, it is used for all windows. When continuing a video, you can either continue the current shot (no Reference Frame) or add new shots (one or more Reference Frames).
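  The mapping of Reference Frames to Sliding Windows can be seen in the `any2video.py` changes further down: each window picks the reference image whose index matches the window number and falls back to the last image when there are more windows than images. Roughly (an illustrative sketch, not the exact WanGP code):

  ```python
  def pick_reference_frame(input_ref_images, window_no):
      """Return the reference image for a sliding window (window_no starting at 1).

      With one reference image every window reuses it; with several, window 1 gets
      image 1, window 2 gets image 2, and any extra windows keep the last image.
      """
      idx = min(window_no, len(input_ref_images)) - 1
      return input_ref_images[idx]

  # e.g. 2 reference images and 4 sliding windows -> shots a, b, b, b
  shots = [pick_reference_frame(["shot_a", "shot_b"], w) for w in (1, 2, 3, 4)]
  ```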
- Flux Chroma 1 HD support: an uncensored Flux-based model that is lighter than Flux (8.9B versus 12B parameters) and can fit entirely in VRAM on a 16 GB GPU. Unfortunately it is not distilled, so you will need CFG and at least 20 steps.
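  Because Chroma is not guidance-distilled, this commit adds true classifier-free guidance to the Flux sampling loop: the positive and negative prompts are both denoised (optionally in a single joint pass) and combined at every step with the usual CFG rule. A sketch of that rule (the function name is illustrative; the formula matches the one added in `sampling.py` below):

  ```python
  def apply_cfg(pred, neg_pred, guidance_scale):
      """Combine positive- and negative-prompt predictions for one denoising step."""
      if guidance_scale <= 1:   # CFG effectively disabled (distilled-style single pass)
          return pred
      return neg_pred + guidance_scale * (pred - neg_pred)
  ```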

### August 21 2025: WanGP v8.01 - the killer of seven

- Qwen Image Edit: a Flux Kontext challenger (prompt-driven image editing). Best results (including identity preservation) are obtained at 720p; beyond that you may get image outpainting and/or lose identity preservation, and below 720p prompt adherence will be worse. Qwen Image Edit works with the Qwen Lora Lightning 4 steps. I have also unlocked all the resolutions for Qwen models. Bonus zone: support for multiple image compositions, although identity preservation won't be as good.

15
configs/infinitetalk.json
Normal file
@ -0,0 +1,15 @@
{
"_class_name": "WanModel",
"_diffusers_version": "0.30.0",
"dim": 5120,
"eps": 1e-06,
"ffn_dim": 13824,
"freq_dim": 256,
"in_dim": 36,
"model_type": "i2v",
"num_heads": 40,
"num_layers": 40,
"out_dim": 16,
"text_len": 512,
"multitalk_output_dim": 768
}
@ -3,7 +3,7 @@
{
"name": "Fantasy Talking 720p",
"architecture" : "fantasy",
"modules": ["fantasy"],
"modules": [ ["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_fantasy_speaking_14B_bf16.safetensors"]],
"description": "The Fantasy Talking model corresponds to the original Wan image 2 video model combined with the Fantasy Speaking module to process an audio Input.",
"URLs": "i2v_720p"
},
18
defaults/flux_chroma.json
Normal file
@ -0,0 +1,18 @@
{
"model": {
"name": "Flux 1 Chroma 1 HD 8.9B",
"architecture": "flux",
"description": "FLUX.1 Chroma is a 8.9 billion parameters model. As a base model, Chroma1 is intentionally designed to be an excellent starting point for finetuning. It provides a strong, neutral foundation for developers, researchers, and artists to create specialized models..",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-chroma_hd_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-chroma_hd_quanto_bf16_int8.safetensors"
],
"image_outputs": true,
"flux-model": "flux-chroma"
},
"prompt": "draw a hat",
"resolution": "1280x720",
"guidance_scale": 3.0,
"num_inference_steps": 20,
"batch_size": 1
}
16
defaults/infinitetalk.json
Normal file
@ -0,0 +1,16 @@
{
"model": {
"name": "Infinitetalk Single Speaker 480p",
"architecture": "infinitetalk",
"modules": [
[
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_single_14B_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_single_14B_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_single_14B_quanto_mfp16_int8.safetensors"
]
],
"description": "The Infinitetalk model is an improved version of Multitalk that supports very long videos. This is the single speaker version.",
"one_speaker_only": true,
"URLs": "i2v"
}
}
16
defaults/infinitetalk_multi.json
Normal file
@ -0,0 +1,16 @@
{
"model": {
"name": "Infinitetalk Multi Speakers 480p",
"architecture": "infinitetalk",
"modules": [
[
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_multi_14B_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_multi_14B_quanto_mfp16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_multi_14B_quanto_mbf16_int8.safetensors"
]
],
"description": "The Infinitetalk model is an improved version of Multitalk that supports very long videos. This is the multi speakers version.",
"multi_speakers_only": true,
"URLs": "i2v"
}
}
@ -3,7 +3,11 @@
{
"name": "Multitalk 480p",
"architecture" : "multitalk",
"modules": ["multitalk"],
"modules": [
["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_multitalk_14B_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_multitalk_14B_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_multitalk_14B_quanto_mfp16_int8.safetensors"]
],
"description": "The Multitalk model corresponds to the original Wan image 2 video model combined with the Multitalk module. It lets you have up to two people have a conversation.",
"URLs": "i2v",
"teacache_coefficients" : [-3.02331670e+02, 2.23948934e+02, -5.25463970e+01, 5.87348440e+00, -2.01973289e-01]
@ -3,8 +3,10 @@
{
"name": "Vace ControlNet 1.3B",
"architecture" : "vace_1.3B",
"modules": ["vace_1.3B"],
"description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based of additional custom data : pose or depth video, images or objects you want to see in the video.",
"modules": [
["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_1_3B_module.safetensors"]
],
"description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based of additional custom data : pose or depth video, images or objects you want to see in the video.",
"URLs": "t2v_1.3B"
}
}
@ -3,7 +3,9 @@
"name": "Vace ControlNet 14B",
"architecture": "vace_14B",
"modules": [
"vace_14B"
["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_14B_module_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_14B_module_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_14B_module_quanto_mfp16_int8.safetensors"]
],
"description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based of additional custom data : pose or depth video, images or objects you want to see in the video.",
"URLs": "t2v"
@ -11,11 +11,14 @@ class family_handler():
|
||||
def query_model_def(base_model_type, model_def):
|
||||
flux_model = model_def.get("flux-model", "flux-dev")
|
||||
flux_schnell = flux_model == "flux-schnell"
|
||||
flux_chroma = flux_model == "flux-chroma"
|
||||
model_def_output = {
|
||||
"image_outputs" : True,
|
||||
"no_negative_prompt" : True,
|
||||
"no_negative_prompt" : not flux_chroma,
|
||||
}
|
||||
if not flux_schnell:
|
||||
if flux_chroma:
|
||||
model_def_output["guidance_max_phases"] = 1
|
||||
elif not flux_schnell:
|
||||
model_def_output["embedded_guidance"] = True
|
||||
|
||||
|
||||
|
||||
@ -50,6 +50,8 @@ class model_factory:
|
||||
self.VAE_dtype = VAE_dtype
|
||||
self.dtype = dtype
|
||||
torch_device = "cpu"
|
||||
self.guidance_max_phases = model_def.get("guidance_max_phases", 0)
|
||||
|
||||
# model_filename = ["c:/temp/flux1-schnell.safetensors"]
|
||||
|
||||
self.t5 = load_t5(torch_device, text_encoder_filename, max_length=512)
|
||||
@ -83,22 +85,27 @@ class model_factory:
|
||||
self,
|
||||
seed: int | None = None,
|
||||
input_prompt: str = "replace the logo with the text 'Black Forest Labs'",
|
||||
n_prompt: str = None,
|
||||
sampling_steps: int = 20,
|
||||
input_ref_images = None,
|
||||
width= 832,
|
||||
height=480,
|
||||
embedded_guidance_scale: float = 2.5,
|
||||
guide_scale = 2.5,
|
||||
fit_into_canvas = None,
|
||||
callback = None,
|
||||
loras_slists = None,
|
||||
batch_size = 1,
|
||||
video_prompt_type = "",
|
||||
joint_pass = False,
|
||||
**bbargs
|
||||
):
|
||||
|
||||
if self._interrupt:
|
||||
return None
|
||||
|
||||
if self.guidance_max_phases < 1:
|
||||
guide_scale = 1
|
||||
if n_prompt is None or len(n_prompt) == 0:
|
||||
n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
|
||||
device="cuda"
|
||||
if "I" in video_prompt_type and input_ref_images != None and len(input_ref_images) > 0:
|
||||
if "K" in video_prompt_type and False :
|
||||
@ -122,6 +129,7 @@ class model_factory:
|
||||
t5=self.t5,
|
||||
clip=self.clip,
|
||||
prompt=input_prompt,
|
||||
neg_prompt= n_prompt,
|
||||
ae=self.vae,
|
||||
img_cond_list=input_ref_images,
|
||||
target_width=width,
|
||||
@ -129,13 +137,14 @@ class model_factory:
|
||||
bs=batch_size,
|
||||
seed=seed,
|
||||
device=device,
|
||||
real_guidance_scale=guide_scale,
|
||||
)
|
||||
|
||||
timesteps = get_schedule(sampling_steps, inp["img"].shape[1], shift=(self.name != "flux-schnell"))
|
||||
def unpack_latent(x):
|
||||
return unpack(x.float(), height, width)
|
||||
# denoise initial noise
|
||||
x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent)
|
||||
x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass)
|
||||
if x==None: return None
|
||||
# decode latents to pixel space
|
||||
x = unpack_latent(x)
|
||||
|
||||
@ -10,6 +10,8 @@ from .modules.layers import (
|
||||
MLPEmbedder,
|
||||
SingleStreamBlock,
|
||||
timestep_embedding,
|
||||
DistilledGuidance,
|
||||
ChromaModulationOut,
|
||||
)
|
||||
from .modules.lora import LinearLora, replace_linear_with_lora
|
||||
|
||||
@ -29,19 +31,47 @@ class FluxParams:
|
||||
theta: int
|
||||
qkv_bias: bool
|
||||
guidance_embed: bool
|
||||
chroma: bool = False
|
||||
|
||||
|
||||
class Flux(nn.Module):
|
||||
"""
|
||||
Transformer model for flow matching on sequences.
|
||||
"""
|
||||
|
||||
def get_modulations(self, tensor: torch.Tensor, block_type: str, *, idx: int = 0):
|
||||
# This function slices up the modulations tensor which has the following layout:
|
||||
# single : num_single_blocks * 3 elements
|
||||
# double_img : num_double_blocks * 6 elements
|
||||
# double_txt : num_double_blocks * 6 elements
|
||||
# final : 2 elements
|
||||
if block_type == "final":
|
||||
return (tensor[:, -2:-1, :], tensor[:, -1:, :])
|
||||
single_block_count = self.params.depth_single_blocks
|
||||
double_block_count = self.params.depth
|
||||
offset = 3 * idx
|
||||
if block_type == "single":
|
||||
return ChromaModulationOut.from_offset(tensor, offset)
|
||||
# Double block modulations are 6 elements so we double 3 * idx.
|
||||
offset *= 2
|
||||
if block_type in {"double_img", "double_txt"}:
|
||||
# Advance past the single block modulations.
|
||||
offset += 3 * single_block_count
|
||||
if block_type == "double_txt":
|
||||
# Advance past the double block img modulations.
|
||||
offset += 6 * double_block_count
|
||||
return (
|
||||
ChromaModulationOut.from_offset(tensor, offset),
|
||||
ChromaModulationOut.from_offset(tensor, offset + 3),
|
||||
)
|
||||
raise ValueError("Bad block_type")
|
||||
|
||||
def __init__(self, params: FluxParams):
|
||||
super().__init__()
|
||||
|
||||
self.params = params
|
||||
self.in_channels = params.in_channels
|
||||
self.out_channels = params.out_channels
|
||||
self.chroma = params.chroma
|
||||
if params.hidden_size % params.num_heads != 0:
|
||||
raise ValueError(
|
||||
f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
|
||||
@ -53,12 +83,21 @@ class Flux(nn.Module):
|
||||
self.num_heads = params.num_heads
|
||||
self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
|
||||
self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
|
||||
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
|
||||
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
|
||||
|
||||
self.guidance_in = (
|
||||
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
|
||||
)
|
||||
self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
|
||||
if self.chroma:
|
||||
self.distilled_guidance_layer = DistilledGuidance(
|
||||
in_dim=64,
|
||||
hidden_dim=5120,
|
||||
out_dim=3072,
|
||||
n_layers=5,
|
||||
)
|
||||
else:
|
||||
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
|
||||
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
|
||||
|
||||
self.double_blocks = nn.ModuleList(
|
||||
[
|
||||
@ -67,6 +106,7 @@ class Flux(nn.Module):
|
||||
self.num_heads,
|
||||
mlp_ratio=params.mlp_ratio,
|
||||
qkv_bias=params.qkv_bias,
|
||||
chroma_modulation = self.chroma,
|
||||
)
|
||||
for _ in range(params.depth)
|
||||
]
|
||||
@ -74,12 +114,12 @@ class Flux(nn.Module):
|
||||
|
||||
self.single_blocks = nn.ModuleList(
|
||||
[
|
||||
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
|
||||
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, chroma_modulation = self.chroma)
|
||||
for _ in range(params.depth_single_blocks)
|
||||
]
|
||||
)
|
||||
|
||||
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
|
||||
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, chroma_modulation = self.chroma)
|
||||
|
||||
def preprocess_loras(self, model_type, sd):
|
||||
new_sd = {}
|
||||
@ -155,8 +195,8 @@ class Flux(nn.Module):
|
||||
self,
|
||||
img: Tensor,
|
||||
img_ids: Tensor,
|
||||
txt: Tensor,
|
||||
txt_ids: Tensor,
|
||||
txt_list,
|
||||
txt_ids_list,
|
||||
timesteps: Tensor,
|
||||
y: Tensor,
|
||||
guidance: Tensor | None = None,
|
||||
@ -164,36 +204,63 @@ class Flux(nn.Module):
|
||||
pipeline =None,
|
||||
|
||||
) -> Tensor:
|
||||
if img.ndim != 3 or txt.ndim != 3:
|
||||
raise ValueError("Input img and txt tensors must have 3 dimensions.")
|
||||
|
||||
sz = len(txt_list)
|
||||
# running on sequences img
|
||||
img = self.img_in(img)
|
||||
vec = self.time_in(timestep_embedding(timesteps, 256))
|
||||
if self.params.guidance_embed:
|
||||
if guidance is None:
|
||||
raise ValueError("Didn't get guidance strength for guidance distilled model.")
|
||||
vec += self.guidance_in(timestep_embedding(guidance, 256))
|
||||
vec += self.vector_in(y)
|
||||
txt = self.txt_in(txt)
|
||||
img_list = [img] if sz==1 else [img, img.clone()]
|
||||
|
||||
if self.chroma:
|
||||
mod_index_length = 344
|
||||
distill_timestep = timestep_embedding(timesteps, 16).to(img.device, img.dtype)
|
||||
guidance = torch.tensor([0.]* distill_timestep.shape[0])
|
||||
distil_guidance = timestep_embedding(guidance, 16).to(img.device, img.dtype)
|
||||
modulation_index = timestep_embedding(torch.arange(mod_index_length, device=img.device), 32).to(img.device, img.dtype)
|
||||
modulation_index = modulation_index.unsqueeze(0).repeat(img.shape[0], 1, 1).to(img.device, img.dtype)
|
||||
timestep_guidance = torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1).to(img.dtype).to(img.device, img.dtype)
|
||||
input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1).to(img.device, img.dtype)
|
||||
mod_vectors = self.distilled_guidance_layer(input_vec)
|
||||
else:
|
||||
vec = self.time_in(timestep_embedding(timesteps, 256))
|
||||
if self.params.guidance_embed:
|
||||
if guidance is None:
|
||||
raise ValueError("Didn't get guidance strength for guidance distilled model.")
|
||||
vec += self.guidance_in(timestep_embedding(guidance, 256))
|
||||
vec += self.vector_in(y)
|
||||
|
||||
ids = torch.cat((txt_ids, img_ids), dim=1)
|
||||
pe = self.pe_embedder(ids)
|
||||
img = None
|
||||
txt_list = [self.txt_in(txt) for txt in txt_list ]
|
||||
pe_list = [self.pe_embedder(torch.cat((txt_ids, img_ids), dim=1)) for txt_ids in txt_ids_list]
|
||||
|
||||
for block in self.double_blocks:
|
||||
for i, block in enumerate(self.double_blocks):
|
||||
if self.chroma: vec = ( self.get_modulations(mod_vectors, "double_img", idx=i), self.get_modulations(mod_vectors, "double_txt", idx=i))
|
||||
if callback != None:
|
||||
callback(-1, None, False, True)
|
||||
if pipeline._interrupt:
|
||||
return None
|
||||
img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
|
||||
return [None] * sz
|
||||
for img, txt, pe in zip(img_list, txt_list, pe_list):
|
||||
img[...], txt[...] = block(img=img, txt=txt, vec=vec, pe=pe)
|
||||
img = txt = pe = None
|
||||
|
||||
img = torch.cat((txt, img), 1)
|
||||
for block in self.single_blocks:
|
||||
img = block(img, vec=vec, pe=pe)
|
||||
img = img[:, txt.shape[1] :, ...]
|
||||
img_list = [torch.cat((txt, img), 1) for txt, img in zip(txt_list, img_list)]
|
||||
|
||||
img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
|
||||
return img
|
||||
for i, block in enumerate(self.single_blocks):
|
||||
if self.chroma: vec = self.get_modulations(mod_vectors, "single", idx=i)
|
||||
if callback != None:
|
||||
callback(-1, None, False, True)
|
||||
if pipeline._interrupt:
|
||||
return [None] * sz
|
||||
for img, pe in zip(img_list, pe_list):
|
||||
img[...]= block(x=img, vec=vec, pe=pe)
|
||||
img = pe = None
|
||||
img_list = [ img[:, txt.shape[1] :, ...] for img, txt in zip(img_list, txt_list)]
|
||||
|
||||
if self.chroma: vec = self.get_modulations(mod_vectors, "final")
|
||||
out_list = []
|
||||
for i, img in enumerate(img_list):
|
||||
out_list.append( self.final_layer(img, vec)) # (N, T, patch_size ** 2 * out_channels)
|
||||
img_list[i] = img = None
|
||||
return out_list
|
||||
|
||||
|
||||
class FluxLoraWrapper(Flux):
|
||||
|
||||
@ -116,6 +116,15 @@ class ModulationOut:
|
||||
scale: Tensor
|
||||
gate: Tensor
|
||||
|
||||
class ChromaModulationOut(ModulationOut):
|
||||
@classmethod
|
||||
def from_offset(cls, tensor: torch.Tensor, offset: int = 0):
|
||||
return cls(
|
||||
shift=tensor[:, offset : offset + 1, :],
|
||||
scale=tensor[:, offset + 1 : offset + 2, :],
|
||||
gate=tensor[:, offset + 2 : offset + 3, :],
|
||||
)
|
||||
|
||||
|
||||
def split_mlp(mlp, x, divide = 8):
|
||||
x_shape = x.shape
|
||||
@ -146,13 +155,15 @@ class Modulation(nn.Module):
|
||||
|
||||
|
||||
class DoubleStreamBlock(nn.Module):
|
||||
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False):
|
||||
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, chroma_modulation = False):
|
||||
super().__init__()
|
||||
|
||||
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
||||
self.num_heads = num_heads
|
||||
self.hidden_size = hidden_size
|
||||
self.img_mod = Modulation(hidden_size, double=True)
|
||||
self.chroma_modulation = chroma_modulation
|
||||
if not chroma_modulation:
|
||||
self.img_mod = Modulation(hidden_size, double=True)
|
||||
self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
|
||||
|
||||
@ -163,7 +174,8 @@ class DoubleStreamBlock(nn.Module):
|
||||
nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
|
||||
)
|
||||
|
||||
self.txt_mod = Modulation(hidden_size, double=True)
|
||||
if not chroma_modulation:
|
||||
self.txt_mod = Modulation(hidden_size, double=True)
|
||||
self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
|
||||
|
||||
@ -175,8 +187,11 @@ class DoubleStreamBlock(nn.Module):
|
||||
)
|
||||
|
||||
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]:
|
||||
img_mod1, img_mod2 = self.img_mod(vec)
|
||||
txt_mod1, txt_mod2 = self.txt_mod(vec)
|
||||
if self.chroma_modulation:
|
||||
(img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
|
||||
else:
|
||||
img_mod1, img_mod2 = self.img_mod(vec)
|
||||
txt_mod1, txt_mod2 = self.txt_mod(vec)
|
||||
|
||||
# prepare image for attention
|
||||
img_modulated = self.img_norm1(img)
|
||||
@ -250,10 +265,12 @@ class SingleStreamBlock(nn.Module):
|
||||
num_heads: int,
|
||||
mlp_ratio: float = 4.0,
|
||||
qk_scale: float | None = None,
|
||||
chroma_modulation = False,
|
||||
):
|
||||
super().__init__()
|
||||
self.hidden_dim = hidden_size
|
||||
self.num_heads = num_heads
|
||||
self.chroma_modulation = chroma_modulation
|
||||
head_dim = hidden_size // num_heads
|
||||
self.scale = qk_scale or head_dim**-0.5
|
||||
|
||||
@ -269,10 +286,14 @@ class SingleStreamBlock(nn.Module):
|
||||
self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
|
||||
self.mlp_act = nn.GELU(approximate="tanh")
|
||||
self.modulation = Modulation(hidden_size, double=False)
|
||||
if not chroma_modulation:
|
||||
self.modulation = Modulation(hidden_size, double=False)
|
||||
|
||||
def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
|
||||
mod, _ = self.modulation(vec)
|
||||
if self.chroma_modulation:
|
||||
mod = vec
|
||||
else:
|
||||
mod, _ = self.modulation(vec)
|
||||
x_mod = self.pre_norm(x)
|
||||
x_mod.mul_(1 + mod.scale)
|
||||
x_mod.add_(mod.shift)
|
||||
@ -316,14 +337,42 @@ class SingleStreamBlock(nn.Module):
|
||||
|
||||
|
||||
class LastLayer(nn.Module):
|
||||
def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
|
||||
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, chroma_modulation = False):
|
||||
super().__init__()
|
||||
self.chroma_modulation = chroma_modulation
|
||||
self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
|
||||
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
|
||||
if not chroma_modulation:
|
||||
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
|
||||
|
||||
def forward(self, x: Tensor, vec: Tensor) -> Tensor:
|
||||
shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
|
||||
x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
|
||||
if self.chroma_modulation:
|
||||
shift, scale = vec
|
||||
shift = shift.squeeze(1)
|
||||
scale = scale.squeeze(1)
|
||||
else:
|
||||
shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
|
||||
# x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
|
||||
x = torch.addcmul(shift[:, None, :], 1 + scale[:, None, :], self.norm_final(x))
|
||||
x = self.linear(x)
|
||||
return x
|
||||
|
||||
|
||||
class DistilledGuidance(nn.Module):
|
||||
def __init__(self, in_dim: int, out_dim: int, hidden_dim: int, n_layers = 5):
|
||||
super().__init__()
|
||||
self.in_proj = nn.Linear(in_dim, hidden_dim, bias=True)
|
||||
self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim) for x in range( n_layers)])
|
||||
self.norms = nn.ModuleList([RMSNorm(hidden_dim) for x in range( n_layers)])
|
||||
self.out_proj = nn.Linear(hidden_dim, out_dim)
|
||||
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
x = self.in_proj(x)
|
||||
|
||||
for layer, norms in zip(self.layers, self.norms):
|
||||
x = x + layer(norms(x))
|
||||
|
||||
x = self.out_proj(x)
|
||||
|
||||
return x
|
||||
@ -220,6 +220,9 @@ def prepare_kontext(
|
||||
target_width: int | None = None,
|
||||
target_height: int | None = None,
|
||||
bs: int = 1,
|
||||
neg_prompt: str | list[str] = None,
|
||||
real_guidance_scale = False,
|
||||
|
||||
) -> tuple[dict[str, Tensor], int, int]:
|
||||
# load and encode the conditioning image
|
||||
if bs == 1 and not isinstance(prompt, str):
|
||||
@ -279,8 +282,16 @@ def prepare_kontext(
|
||||
)
|
||||
|
||||
return_dict = prepare(t5, clip, img, prompt)
|
||||
if real_guidance_scale != 1:
|
||||
temp_dict = prepare(t5, clip, img, neg_prompt)
|
||||
return_dict["neg_txt"]= temp_dict["txt"]
|
||||
return_dict["neg_txt_ids"]= temp_dict["txt_ids"]
|
||||
else:
|
||||
return_dict["neg_txt"]= None
|
||||
return_dict["neg_txt_ids"]= None
|
||||
return_dict["img_cond_seq"] = img_cond_seq
|
||||
return_dict["img_cond_seq_ids"] = img_cond_seq_ids
|
||||
|
||||
return return_dict, target_height, target_width
|
||||
|
||||
|
||||
@ -326,7 +337,10 @@ def denoise(
|
||||
# sampling parameters
|
||||
timesteps: list[float],
|
||||
guidance: float = 4.0,
|
||||
real_guidance_scale = None,
|
||||
# extra img tokens (channel-wise)
|
||||
neg_txt: Tensor = None,
|
||||
neg_txt_ids: Tensor= None,
|
||||
img_cond: Tensor | None = None,
|
||||
# extra img tokens (sequence-wise)
|
||||
img_cond_seq: Tensor | None = None,
|
||||
@ -335,6 +349,7 @@ def denoise(
|
||||
pipeline=None,
|
||||
loras_slists=None,
|
||||
unpack_latent = None,
|
||||
joint_pass= False,
|
||||
):
|
||||
|
||||
kwargs = {'pipeline': pipeline, 'callback': callback}
|
||||
@ -360,27 +375,54 @@ def denoise(
|
||||
if img_cond is not None:
|
||||
img_input = torch.cat((img, img_cond), dim=-1)
|
||||
if img_cond_seq is not None:
|
||||
assert (
|
||||
img_cond_seq_ids is not None
|
||||
), "You need to provide either both or neither of the sequence conditioning"
|
||||
img_input = torch.cat((img_input, img_cond_seq), dim=1)
|
||||
img_input_ids = torch.cat((img_input_ids, img_cond_seq_ids), dim=1)
|
||||
pred = model(
|
||||
img=img_input,
|
||||
img_ids=img_input_ids,
|
||||
txt=txt,
|
||||
txt_ids=txt_ids,
|
||||
y=vec,
|
||||
timesteps=t_vec,
|
||||
guidance=guidance_vec,
|
||||
**kwargs
|
||||
)
|
||||
if pred == None: return None
|
||||
if not joint_pass or real_guidance_scale == 1:
|
||||
pred = model(
|
||||
img=img_input,
|
||||
img_ids=img_input_ids,
|
||||
txt_list=[txt],
|
||||
txt_ids_list=[txt_ids],
|
||||
y=vec,
|
||||
timesteps=t_vec,
|
||||
guidance=guidance_vec,
|
||||
**kwargs
|
||||
)[0]
|
||||
if pred == None: return None
|
||||
if real_guidance_scale> 1:
|
||||
neg_pred = model(
|
||||
img=img_input,
|
||||
img_ids=img_input_ids,
|
||||
txt_list=[neg_txt],
|
||||
txt_ids_list=[neg_txt_ids],
|
||||
y=vec,
|
||||
timesteps=t_vec,
|
||||
guidance=guidance_vec,
|
||||
**kwargs
|
||||
)[0]
|
||||
if neg_pred == None: return None
|
||||
else:
|
||||
pred, neg_pred = model(
|
||||
img=img_input,
|
||||
img_ids=img_input_ids,
|
||||
txt_list=[txt, neg_txt],
|
||||
txt_ids_list=[txt_ids, neg_txt_ids],
|
||||
y=vec,
|
||||
timesteps=t_vec,
|
||||
guidance=guidance_vec,
|
||||
**kwargs
|
||||
)
|
||||
if pred == None: return None
|
||||
|
||||
|
||||
if img_input_ids is not None:
|
||||
pred = pred[:, : img.shape[1]]
|
||||
if real_guidance_scale > 1:
|
||||
if img_input_ids is not None:
|
||||
neg_pred = neg_pred[:, : img.shape[1]]
|
||||
pred = neg_pred + real_guidance_scale * (pred - neg_pred)
|
||||
|
||||
img += (t_prev - t_curr) * pred
|
||||
img += (t_prev - t_curr) * pred
|
||||
if callback is not None:
|
||||
preview = unpack_latent(img).transpose(0,1)
|
||||
callback(i, preview, False)
|
||||
|
||||
@ -355,6 +355,38 @@ configs = {
|
||||
shift_factor=0.1159,
|
||||
),
|
||||
),
|
||||
"flux-chroma": ModelSpec(
|
||||
repo_id="lodestones/Chroma1-HD",
|
||||
repo_flow="",
|
||||
repo_ae="ckpts/flux_vae.safetensors",
|
||||
params=FluxParams(
|
||||
in_channels=64,
|
||||
out_channels=64,
|
||||
vec_in_dim=768,
|
||||
context_in_dim=4096,
|
||||
hidden_size=3072,
|
||||
mlp_ratio=4.0,
|
||||
num_heads=24,
|
||||
depth=19,
|
||||
depth_single_blocks=38,
|
||||
axes_dim=[16, 56, 56],
|
||||
theta=10_000,
|
||||
qkv_bias=True,
|
||||
guidance_embed=False,
|
||||
chroma=True,
|
||||
),
|
||||
ae_params=AutoEncoderParams(
|
||||
resolution=256,
|
||||
in_channels=3,
|
||||
ch=128,
|
||||
out_ch=3,
|
||||
ch_mult=[1, 2, 4, 4],
|
||||
num_res_blocks=2,
|
||||
z_channels=16,
|
||||
scale_factor=0.3611,
|
||||
shift_factor=0.1159,
|
||||
),
|
||||
),
|
||||
"flux-dev-canny": ModelSpec(
|
||||
repo_id="black-forest-labs/FLUX.1-Canny-dev",
|
||||
repo_flow="",
|
||||
|
||||
@ -50,6 +50,12 @@ class family_handler():
|
||||
extra_model_def["cfg_star"] = base_model_type in [ "hunyuan_avatar", "hunyuan_custom_audio", "hunyuan_custom_edit", "hunyuan_custom"]
|
||||
extra_model_def["tea_cache"] = True
|
||||
extra_model_def["mag_cache"] = True
|
||||
|
||||
if base_model_type in ["hunyuan_avatar"]: extra_model_def["no_background_removal"] = True
|
||||
|
||||
if base_model_type in ["hunyuan_custom", "hunyuan_custom_edit", "hunyuan_audio", "hunyuan_avatar"]:
|
||||
extra_model_def["one_image_ref_needed"] = True
|
||||
|
||||
return extra_model_def
|
||||
|
||||
@staticmethod
|
||||
|
||||
@ -53,8 +53,7 @@ class model_factory():
|
||||
tokenizer = None
|
||||
if base_model_type == "qwen_image_edit_20B":
|
||||
processor = Qwen2VLProcessor.from_pretrained(os.path.join(checkpoint_dir,"Qwen2.5-VL-7B-Instruct"))
|
||||
else:
|
||||
tokenizer = AutoTokenizer.from_pretrained(os.path.join(checkpoint_dir,"Qwen2.5-VL-7B-Instruct"))
|
||||
tokenizer = AutoTokenizer.from_pretrained(os.path.join(checkpoint_dir,"Qwen2.5-VL-7B-Instruct"))
|
||||
|
||||
|
||||
base_config_file = "configs/qwen_image_20B.json"
|
||||
|
||||
@ -31,7 +31,7 @@ from shared.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
|
||||
from .modules.posemb_layers import get_rotary_pos_embed
|
||||
from shared.utils.vace_preprocessor import VaceVideoProcessor
|
||||
from shared.utils.basic_flowmatch import FlowMatchScheduler
|
||||
from shared.utils.utils import get_outpainting_frame_location, resize_lanczos, calculate_new_dimensions
|
||||
from shared.utils.utils import get_outpainting_frame_location, resize_lanczos, calculate_new_dimensions, convert_image_to_tensor
|
||||
from .multitalk.multitalk_utils import MomentumBuffer, adaptive_projected_guidance, match_and_blend_colors, match_and_blend_colors_with_mask
|
||||
from mmgp import safetensors2
|
||||
|
||||
@ -127,8 +127,12 @@ class WanAny2V:
|
||||
# model_filename[1] = xmodel_filename
|
||||
|
||||
source = model_def.get("source", None)
|
||||
|
||||
if source is not None:
|
||||
module_source = model_def.get("module_source", None)
|
||||
if module_source is not None:
|
||||
model_filename = [] + model_filename
|
||||
model_filename[1] = module_source
|
||||
self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
|
||||
elif source is not None:
|
||||
self.model = offload.fast_load_transformers_model(source, modelClass=WanModel, writable_tensors= False, forcedConfigPath= base_config_file)
|
||||
elif self.transformer_switch:
|
||||
shared_modules= {}
|
||||
@ -153,7 +157,12 @@ class WanAny2V:
|
||||
self.model.eval().requires_grad_(False)
|
||||
if self.model2 is not None:
|
||||
self.model2.eval().requires_grad_(False)
|
||||
if not source is None:
|
||||
if module_source is not None:
|
||||
from wgp import save_model
|
||||
from mmgp.safetensors2 import torch_load_file
|
||||
filter = list(torch_load_file(module_source))
|
||||
save_model(self.model, model_type, dtype, None, is_module=True, filter=filter)
|
||||
elif not source is None:
|
||||
from wgp import save_model
|
||||
save_model(self.model, model_type, dtype, None)
|
||||
|
||||
@ -432,6 +441,7 @@ class WanAny2V:
|
||||
image_mode = 0,
|
||||
window_no = 0,
|
||||
set_header_text = None,
|
||||
pre_video_frame = None,
|
||||
**bbargs
|
||||
):
|
||||
|
||||
@ -500,36 +510,57 @@ class WanAny2V:
|
||||
vace = model_type in ["vace_1.3B","vace_14B", "vace_multitalk_14B"]
|
||||
phantom = model_type in ["phantom_1.3B", "phantom_14B"]
|
||||
fantasy = model_type in ["fantasy"]
|
||||
multitalk = model_type in ["multitalk", "vace_multitalk_14B", "i2v_2_2_multitalk"]
|
||||
multitalk = model_type in ["multitalk", "infinitetalk", "vace_multitalk_14B", "i2v_2_2_multitalk"]
|
||||
infinitetalk = model_type in ["infinitetalk"]
|
||||
recam = model_type in ["recam_1.3B"]
|
||||
ti2v = model_type in ["ti2v_2_2"]
|
||||
start_step_no = 0
|
||||
ref_images_count = 0
|
||||
trim_frames = 0
|
||||
extended_overlapped_latents = None
|
||||
no_noise_latents_injection = infinitetalk
|
||||
timestep_injection = False
|
||||
lat_frames = int((frame_num - 1) // self.vae_stride[0]) + 1
|
||||
# image2video
|
||||
if model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "fantasy", "multitalk", "i2v_2_2_multitalk", "flf2v_720p"]:
|
||||
if model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "fantasy", "multitalk", "infinitetalk", "i2v_2_2_multitalk", "flf2v_720p"]:
|
||||
any_end_frame = False
|
||||
if image_start is None:
|
||||
_ , preframes_count, height, width = input_video.shape
|
||||
if infinitetalk:
|
||||
if pre_video_frame is None:
|
||||
new_shot = True
|
||||
else:
|
||||
if input_ref_images is None:
|
||||
input_ref_images, new_shot = [pre_video_frame], False
|
||||
else:
|
||||
input_ref_images, new_shot = [img.resize(pre_video_frame.size, resample=Image.Resampling.LANCZOS) for img in input_ref_images], True
|
||||
if input_ref_images is None: raise Exception("Missing Reference Image")
|
||||
image_ref = convert_image_to_tensor(input_ref_images[ min(window_no, len(input_ref_images))-1 ])
|
||||
if new_shot and window_no <= len(input_ref_images):
|
||||
input_video = image_ref.unsqueeze(1)
|
||||
_ , preframes_count, height, width = input_video.shape
|
||||
input_video = input_video.to(device=self.device).to(dtype= self.VAE_dtype)
|
||||
if infinitetalk:
|
||||
image_for_clip = image_ref.to(input_video)
|
||||
control_pre_frames_count = 1
|
||||
control_video = image_for_clip.unsqueeze(1)
|
||||
else:
|
||||
image_for_clip = input_video[:, -1]
|
||||
control_pre_frames_count = preframes_count
|
||||
control_video = input_video
|
||||
lat_h, lat_w = height // self.vae_stride[1], width // self.vae_stride[2]
|
||||
if hasattr(self, "clip"):
|
||||
clip_image_size = self.clip.model.image_size
|
||||
clip_image = resize_lanczos(input_video[:, -1], clip_image_size, clip_image_size)[:, None, :, :]
|
||||
clip_image = resize_lanczos(image_for_clip, clip_image_size, clip_image_size)[:, None, :, :]
|
||||
clip_context = self.clip.visual([clip_image]) if model_type != "flf2v_720p" else self.clip.visual([clip_image , clip_image ])
|
||||
clip_image = None
|
||||
else:
|
||||
clip_context = None
|
||||
input_video = input_video.to(device=self.device).to(dtype= self.VAE_dtype)
|
||||
enc = torch.concat( [input_video, torch.zeros( (3, frame_num-preframes_count, height, width),
|
||||
device=self.device, dtype= self.VAE_dtype)],
|
||||
dim = 1).to(self.device)
|
||||
color_reference_frame = input_video[:, -1:].clone()
|
||||
input_video = None
|
||||
enc = torch.concat( [control_video, torch.zeros( (3, frame_num-control_pre_frames_count, height, width),
|
||||
device=self.device, dtype= self.VAE_dtype)],
|
||||
dim = 1).to(self.device)
|
||||
color_reference_frame = image_for_clip.unsqueeze(1).clone()
|
||||
else:
|
||||
preframes_count = 1
|
||||
preframes_count = control_pre_frames_count = 1
|
||||
any_end_frame = image_end is not None
|
||||
add_frames_for_end_image = any_end_frame and model_type == "i2v"
|
||||
if any_end_frame:
|
||||
@ -576,30 +607,34 @@ class WanAny2V:
|
||||
torch.zeros( (3, frame_num-1, height, width), device=self.device, dtype= self.VAE_dtype)
|
||||
], dim=1).to(self.device)
|
||||
|
||||
image_start = image_end = image_start_frame = img_end_frame = None
|
||||
image_start = image_end = image_start_frame = img_end_frame = image_for_clip = image_ref = None
|
||||
|
||||
msk = torch.ones(1, frame_num, lat_h, lat_w, device=self.device)
|
||||
if any_end_frame:
|
||||
msk[:, preframes_count: -1] = 0
|
||||
msk[:, control_pre_frames_count: -1] = 0
|
||||
if add_frames_for_end_image:
|
||||
msk = torch.concat([ torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:-1], torch.repeat_interleave(msk[:, -1:], repeats=4, dim=1) ], dim=1)
|
||||
else:
|
||||
msk = torch.concat([ torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:] ], dim=1)
|
||||
else:
|
||||
msk[:, preframes_count:] = 0
|
||||
msk[:, control_pre_frames_count:] = 0
|
||||
msk = torch.concat([ torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:] ], dim=1)
|
||||
msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
|
||||
msk = msk.transpose(1, 2)[0]
|
||||
|
||||
|
||||
lat_y = self.vae.encode([enc], VAE_tile_size, any_end_frame= any_end_frame and add_frames_for_end_image)[0]
|
||||
y = torch.concat([msk, lat_y])
|
||||
overlapped_latents_frames_num = int(1 + (preframes_count-1) // 4)
|
||||
if overlapped_latents != None:
|
||||
# if overlapped_latents != None:
|
||||
if overlapped_latents_frames_num > 0:
|
||||
# disabled because looks worse
|
||||
if False and overlapped_latents_frames_num > 1: lat_y[:, :, 1:overlapped_latents_frames_num] = overlapped_latents[:, 1:]
|
||||
if infinitetalk:
|
||||
lat_y = self.vae.encode([input_video], VAE_tile_size)[0]
|
||||
extended_overlapped_latents = lat_y[:, :overlapped_latents_frames_num].clone().unsqueeze(0)
|
||||
y = torch.concat([msk, lat_y])
|
||||
lat_y = None
|
||||
# if control_pre_frames_count != pre_frames_count:
|
||||
|
||||
lat_y = input_video = None
|
||||
kwargs.update({ 'y': y})
|
||||
if not clip_context is None:
|
||||
kwargs.update({'clip_fea': clip_context})
|
||||
@ -813,8 +848,10 @@ class WanAny2V:
|
||||
noise = None
|
||||
|
||||
if extended_overlapped_latents != None:
|
||||
latent_noise_factor = t / 1000
|
||||
latents[:, :, :extended_overlapped_latents.shape[2]] = extended_overlapped_latents * (1.0 - latent_noise_factor) + torch.randn_like(extended_overlapped_latents ) * latent_noise_factor
|
||||
if no_noise_latents_injection:
|
||||
latents[:, :, :extended_overlapped_latents.shape[2]] = extended_overlapped_latents
|
||||
else:
|
||||
latent_noise_factor = t / 1000
|
||||
if vace:
|
||||
overlap_noise_factor = overlap_noise / 1000
|
||||
for zz in z:
|
||||
|
||||
@ -71,7 +71,7 @@ def audio_prepare_single(audio_path, sample_rate=16000, duration = 0):
|
||||
return human_speech_array
|
||||
|
||||
|
||||
def audio_prepare_multi(left_path, right_path, audio_type = "add", sample_rate=16000, duration = 0, pad = 0):
|
||||
def audio_prepare_multi(left_path, right_path, audio_type = "add", sample_rate=16000, duration = 0, pad = 0, min_audio_duration = 0):
|
||||
if not (left_path==None or right_path==None):
|
||||
human_speech_array1 = audio_prepare_single(left_path, duration = duration)
|
||||
human_speech_array2 = audio_prepare_single(right_path, duration = duration)
|
||||
@ -95,6 +95,14 @@ def audio_prepare_multi(left_path, right_path, audio_type = "add", sample_rate=1
|
||||
new_human_speech1 = np.concatenate([np.zeros(pad), new_human_speech1])
|
||||
new_human_speech2 = np.concatenate([np.zeros(pad), new_human_speech2])
|
||||
|
||||
if min_audio_duration > 0:
|
||||
min_samples = math.ceil( min_audio_duration * sample_rate)
|
||||
if len(new_human_speech1) < min_samples:
|
||||
new_human_speech1 = np.concatenate([new_human_speech1, np.zeros(min_samples -len(new_human_speech1)) ])
|
||||
if len(new_human_speech2) < min_samples:
|
||||
new_human_speech2 = np.concatenate([new_human_speech2, np.zeros(min_samples -len(new_human_speech2)) ])
|
||||
|
||||
|
||||
return new_human_speech1, new_human_speech2, sum_human_speechs
|
||||
|
||||
def process_tts_single(text, save_dir, voice1):
|
||||
@ -170,11 +178,11 @@ def process_tts_multi(text, save_dir, voice1, voice2):
|
||||
return s1, s2, save_path_sum
|
||||
|
||||
|
||||
def get_full_audio_embeddings(audio_guide1 = None, audio_guide2 = None, combination_type ="add", num_frames = 0, fps = 25, sr = 16000, padded_frames_for_embeddings = 0):
|
||||
def get_full_audio_embeddings(audio_guide1 = None, audio_guide2 = None, combination_type ="add", num_frames = 0, fps = 25, sr = 16000, padded_frames_for_embeddings = 0, min_audio_duration = 0):
|
||||
wav2vec_feature_extractor, audio_encoder= custom_init('cpu', "ckpts/chinese-wav2vec2-base")
|
||||
# wav2vec_feature_extractor, audio_encoder= custom_init('cpu', "ckpts/wav2vec")
|
||||
pad = int(padded_frames_for_embeddings/ fps * sr)
|
||||
new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(audio_guide1, audio_guide2, combination_type, duration= num_frames / fps, pad = pad)
|
||||
new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(audio_guide1, audio_guide2, combination_type, duration= num_frames / fps, pad = pad, min_audio_duration = min_audio_duration )
|
||||
audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder, sr=sr, fps= fps)
|
||||
audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder, sr=sr, fps= fps)
|
||||
full_audio_embs = []
|
||||
|
||||
@ -2,13 +2,13 @@ import torch
|
||||
import numpy as np
|
||||
|
||||
def test_class_i2v(base_model_type):
|
||||
return base_model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk", "i2v_2_2_multitalk" ] #"hunyuan_i2v",
|
||||
return base_model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk", "infinitetalk", "i2v_2_2_multitalk" ]
|
||||
|
||||
def test_class_1_3B(base_model_type):
|
||||
return base_model_type in [ "vace_1.3B", "t2v_1.3B", "recam_1.3B","phantom_1.3B","fun_inp_1.3B"]
|
||||
|
||||
def test_multitalk(base_model_type):
|
||||
return base_model_type in ["multitalk", "vace_multitalk_14B", "i2v_2_2_multitalk"]
|
||||
return base_model_type in ["multitalk", "vace_multitalk_14B", "i2v_2_2_multitalk", "infinitetalk"]
|
||||
|
||||
class family_handler():
|
||||
|
||||
@ -64,17 +64,6 @@ class family_handler():
|
||||
text_encoder_filename = text_encoder_filename.replace("bf16", "quanto_int8")
|
||||
return text_encoder_filename
|
||||
|
||||
|
||||
|
||||
@staticmethod
|
||||
def query_modules_files():
|
||||
return {
|
||||
"vace_14B" : ["ckpts/wan2.1_Vace_14B_module_mbf16.safetensors", "ckpts/wan2.1_Vace_14B_module_quanto_mbf16_int8.safetensors", "ckpts/wan2.1_Vace_14B_module_quanto_mfp16_int8.safetensors"],
|
||||
"vace_1.3B" : ["ckpts/wan2.1_Vace_1_3B_module.safetensors"],
|
||||
"fantasy": ["ckpts/wan2.1_fantasy_speaking_14B_bf16.safetensors"],
|
||||
"multitalk": ["ckpts/wan2.1_multitalk_14B_mbf16.safetensors", "ckpts/wan2.1_multitalk_14B_quanto_mbf16_int8.safetensors", "ckpts/wan2.1_multitalk_14B_quanto_mfp16_int8.safetensors"]
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def query_model_def(base_model_type, model_def):
|
||||
extra_model_def = {}
|
||||
@ -103,7 +92,7 @@ class family_handler():
|
||||
extra_model_def.update({
|
||||
"frames_minimum" : frames_minimum,
|
||||
"frames_steps" : frames_steps,
|
||||
"sliding_window" : base_model_type in ["multitalk", "t2v", "fantasy"] or test_class_i2v(base_model_type) or vace_class, #"ti2v_2_2",
|
||||
"sliding_window" : base_model_type in ["multitalk", "infinitetalk", "t2v", "fantasy"] or test_class_i2v(base_model_type) or vace_class, #"ti2v_2_2",
|
||||
"multiple_submodels" : multiple_submodels,
|
||||
"guidance_max_phases" : 3,
|
||||
"skip_layer_guidance" : True,
|
||||
@ -112,18 +101,22 @@ class family_handler():
|
||||
"adaptive_projected_guidance" : True,
|
||||
"tea_cache" : not (base_model_type in ["i2v_2_2", "ti2v_2_2" ] or multiple_submodels),
|
||||
"mag_cache" : True,
|
||||
"first_ref_is_start_image": base_model_type in ["infinitetalk"],
|
||||
"sample_solvers":[
|
||||
("unipc", "unipc"),
|
||||
("euler", "euler"),
|
||||
("dpm++", "dpm++"),
|
||||
("flowmatch causvid", "causvid"), ]
|
||||
})
|
||||
if base_model_type in ["infinitetalk"]:
|
||||
extra_model_def["no_background_removal"] = True
|
||||
# extra_model_def["at_least_one_image_ref_needed"] = True
|
||||
|
||||
return extra_model_def
|
||||
|
||||
@staticmethod
|
||||
def query_supported_types():
|
||||
return ["multitalk", "fantasy", "vace_14B", "vace_multitalk_14B",
|
||||
return ["multitalk", "infinitetalk", "fantasy", "vace_14B", "vace_multitalk_14B",
|
||||
"t2v_1.3B", "t2v", "vace_1.3B", "phantom_1.3B", "phantom_14B",
|
||||
"recam_1.3B",
|
||||
"i2v", "i2v_2_2", "i2v_2_2_multitalk", "ti2v_2_2", "flf2v_720p", "fun_inp_1.3B", "fun_inp"]
|
||||
@ -250,6 +243,17 @@ class family_handler():
|
||||
"adaptive_switch" : 1,
|
||||
})
|
||||
|
||||
elif base_model_type in ["infinitetalk"]:
|
||||
ui_defaults.update({
|
||||
"guidance_scale": 5.0,
|
||||
"flow_shift": 7, # 11 for 720p
|
||||
"sliding_window_overlap" : 9,
|
||||
"sample_solver" : "euler",
|
||||
"video_prompt_type": "KI",
|
||||
"remove_background_images_ref" : 0,
|
||||
"adaptive_switch" : 1,
|
||||
})
|
||||
|
||||
elif base_model_type in ["phantom_1.3B", "phantom_14B"]:
|
||||
ui_defaults.update({
|
||||
"guidance_scale": 7.5,
|
||||
@ -274,5 +278,14 @@ class family_handler():
|
||||
|
||||
if model_def.get("multiple_submodels", False):
|
||||
ui_defaults["guidance_phases"] = 2
|
||||
|
||||
|
||||
|
||||
@staticmethod
|
||||
def validate_generative_settings(base_model_type, model_def, inputs):
|
||||
if base_model_type in ["infinitetalk"]:
|
||||
video_source = inputs["video_source"]
|
||||
image_refs = inputs["image_refs"]
|
||||
video_prompt_type = inputs["video_prompt_type"]
|
||||
image_prompt_type = inputs["image_prompt_type"]
|
||||
if ("V" in image_prompt_type or "L" in image_prompt_type) and image_refs is None:
|
||||
video_prompt_type = video_prompt_type.replace("I", "").replace("K","")
|
||||
inputs["video_prompt_type"] = video_prompt_type
|
||||
|
||||
@ -138,6 +138,10 @@ def remove_background(img, session=None):
|
||||
img = remove(img, session=session, alpha_matting = True, bgcolor=[255, 255, 255, 0]).convert('RGB')
|
||||
return torch.from_numpy(np.array(img).astype(np.float32) / 255.0).movedim(-1, 0)
|
||||
|
||||
|
||||
def convert_image_to_tensor(image):
|
||||
return torch.from_numpy(np.array(image).astype(np.float32)).div_(127.5).sub_(1.).movedim(-1, 0)
|
||||
|
||||
def convert_tensor_to_image(t, frame_no = -1):
|
||||
t = t[:, frame_no] if frame_no >= 0 else t
|
||||
return Image.fromarray(t.clone().add_(1.).mul_(127.5).permute(1,2,0).to(torch.uint8).cpu().numpy())
|
||||
|
||||
253
wgp.py
@ -17,7 +17,7 @@ import numpy as np
|
||||
import importlib
|
||||
from shared.utils import notification_sound
|
||||
from shared.utils.loras_mutipliers import preparse_loras_multipliers, parse_loras_multipliers
|
||||
from shared.utils.utils import convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video, calculate_new_dimensions
|
||||
from shared.utils.utils import convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video, calculate_new_dimensions, convert_image_to_tensor
|
||||
from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, combine_and_concatenate_video_with_audio_tracks, cleanup_temp_audio_files, save_video, save_image
|
||||
from shared.utils.audio_video import save_image_metadata, read_image_metadata
|
||||
from shared.match_archi import match_nvidia_architecture
|
||||
@ -55,7 +55,7 @@ AUTOSAVE_FILENAME = "queue.zip"
|
||||
PROMPT_VARS_MAX = 10
|
||||
|
||||
target_mmgp_version = "3.5.10"
|
||||
WanGP_version = "8.01"
|
||||
WanGP_version = "8.1"
|
||||
settings_version = 2.25
|
||||
max_source_video_frames = 3000
|
||||
prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
|
||||
@ -73,6 +73,23 @@ unique_id = 0
|
||||
unique_id_lock = threading.Lock()
|
||||
gen_lock = threading.Lock()
|
||||
offloadobj = enhancer_offloadobj = wan_model = None
|
||||
reload_needed = True
|
||||
|
||||
def release_model():
|
||||
global wan_model, offloadobj, reload_needed
|
||||
wan_model = None
|
||||
if offloadobj is not None:
|
||||
offloadobj.release()
|
||||
offloadobj = None
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
try:
|
||||
torch._C._host_emptyCache()
|
||||
except:
|
||||
pass
|
||||
reload_needed = True
|
||||
else:
|
||||
gc.collect()
|
||||
|
||||
def get_unique_id():
|
||||
global unique_id
|
||||
@ -180,6 +197,7 @@ def process_prompt_and_add_tasks(state, model_choice):
|
||||
queue = gen.get("queue", [])
|
||||
return get_queue_table(queue)
|
||||
model_def = get_model_def(model_type)
|
||||
model_handler = get_model_handler(model_type)
|
||||
image_outputs = inputs["image_mode"] == 1
|
||||
any_steps_skipping = model_def.get("tea_cache", False) or model_def.get("mag_cache", False)
|
||||
model_type = get_base_model_type(model_type)
|
||||
@ -240,6 +258,11 @@ def process_prompt_and_add_tasks(state, model_choice):
|
||||
queue= gen.get("queue", [])
|
||||
return update_queue_data(queue)
|
||||
|
||||
if hasattr(model_handler, "validate_generative_settings"):
|
||||
error = model_handler.validate_generative_settings(model_type, model_def, inputs)
|
||||
if error is not None and len(error) > 0:
|
||||
gr.Info(error)
|
||||
return
|
||||
if inputs.get("cfg_star_switch", 0) != 0 and inputs.get("apg_switch", 0) != 0:
|
||||
gr.Info("Adaptive Progressive Guidance and Classifier Free Guidance Star can not be set at the same time")
|
||||
return
|
||||
@ -386,20 +409,16 @@ def process_prompt_and_add_tasks(state, model_choice):
|
||||
if not "I" in video_prompt_type and not not "V" in video_prompt_type:
|
||||
gr.Info("To get good results with Multitalk and two people speaking, it is recommended to set a Reference Frame or a Control Video (potentially truncated) that contains the two people one on each side")
|
||||
|
||||
# if len(filter_letters(image_prompt_type, "VL")) > 0 :
|
||||
# if "R" in audio_prompt_type:
|
||||
# gr.Info("Remuxing is not yet supported if there is a video source")
|
||||
# audio_prompt_type= audio_prompt_type.replace("R" ,"")
|
||||
# if "A" in audio_prompt_type:
|
||||
# gr.Info("Creating an Audio track is not yet supported if there is a video source")
|
||||
# return
|
||||
|
||||
if model_type in ["hunyuan_custom", "hunyuan_custom_edit", "hunyuan_audio", "hunyuan_avatar"]:
|
||||
if model_def.get("one_image_ref_needed", False):
|
||||
if image_refs == None :
|
||||
gr.Info("You must provide an Image Reference")
|
||||
return
|
||||
if len(image_refs) > 1:
|
||||
gr.Info("Only one Image Reference (a person) is supported for the moment by Hunyuan Custom / Avatar")
|
||||
gr.Info("Only one Image Reference (a person) is supported for the moment by this model")
|
||||
return
|
||||
if model_def.get("at_least_one_image_ref_needed", False):
|
||||
if image_refs == None :
|
||||
gr.Info("You must provide at least one Image Reference")
|
||||
return
|
||||
|
||||
if "I" in video_prompt_type:
|
||||
@ -1903,15 +1922,24 @@ def get_model_name(model_type, description_container = [""]):
|
||||
def get_model_record(model_name):
|
||||
return f"WanGP v{WanGP_version} by DeepBeepMeep - " + model_name
|
||||
|
||||
def get_model_recursive_prop(model_type, prop = "URLs", return_list = True, stack= []):
|
||||
def get_model_recursive_prop(model_type, prop = "URLs", sub_prop_name = None, return_list = True, stack= []):
|
||||
model_def = models_def.get(model_type, None)
|
||||
if model_def != None:
|
||||
prop_value = model_def.get(prop, None)
|
||||
if prop_value == None:
|
||||
return []
|
||||
if sub_prop_name is not None:
|
||||
if sub_prop_name == "_list":
|
||||
if not isinstance(prop_value,list) or len(prop_value) != 1:
|
||||
raise Exception(f"Sub property value for property {prop} of model type {model_type} should be a list of size 1")
|
||||
prop_value = prop_value[0]
|
||||
else:
|
||||
if not isinstance(prop_value,dict) and not sub_prop_name in prop_value:
|
||||
raise Exception(f"Invalid sub property value {sub_prop_name} for property {prop} of model type {model_type}")
|
||||
prop_value = prop_value[sub_prop_name]
|
||||
if isinstance(prop_value, str):
|
||||
if len(stack) > 10: raise Exception(f"Circular Reference in Model {prop} dependencies: {stack}")
|
||||
return get_model_recursive_prop(prop_value, prop = prop, stack = stack + [prop_value] )
|
||||
return get_model_recursive_prop(prop_value, prop = prop, sub_prop_name =sub_prop_name, stack = stack + [prop_value] )
|
||||
else:
|
||||
return prop_value
|
||||
else:
|
||||
@ -1924,10 +1952,21 @@ def get_model_recursive_prop(model_type, prop = "URLs", return_list = True, sta
|
||||
def get_model_filename(model_type, quantization ="int8", dtype_policy = "", module_type = None, submodel_no = 1, stack=[]):
|
||||
if module_type is not None:
|
||||
base_model_type = get_base_model_type(model_type)
|
||||
model_type_handler = model_types_handlers[base_model_type]
|
||||
modules_files = model_type_handler.query_modules_files() if hasattr(model_type_handler, "query_modules_files") else {}
|
||||
choices = modules_files.get(module_type, None)
|
||||
if choices == None: raise Exception(f"Invalid Module Id '{module_type}'")
|
||||
# model_type_handler = model_types_handlers[base_model_type]
|
||||
# modules_files = model_type_handler.query_modules_files() if hasattr(model_type_handler, "query_modules_files") else {}
|
||||
if isinstance(module_type, list):
|
||||
URLs = module_type
|
||||
else:
|
||||
if "#" not in module_type:
|
||||
sub_prop_name = "_list"
|
||||
else:
|
||||
pos = module_type.rfind("#")
|
||||
sub_prop_name = module_type[pos+1:]
|
||||
module_type = module_type[:pos]
|
||||
URLs = get_model_recursive_prop(module_type, "modules", sub_prop_name =sub_prop_name, return_list= False)
|
||||
|
||||
# choices = modules_files.get(module_type, None)
|
||||
# if choices == None: raise Exception(f"Invalid Module Id '{module_type}'")
|
||||
else:
|
||||
key_name = "URLs" if submodel_no <= 1 else f"URLs{submodel_no}"
|
||||
|
||||
@ -1937,8 +1976,8 @@ def get_model_filename(model_type, quantization ="int8", dtype_policy = "", modu
|
||||
if isinstance(URLs, str):
|
||||
if len(stack) > 10: raise Exception(f"Circular Reference in Model {key_name} dependencies: {stack}")
|
||||
return get_model_filename(URLs, quantization=quantization, dtype_policy=dtype_policy, submodel_no = submodel_no, stack = stack + [URLs])
|
||||
else:
|
||||
choices = [ ("ckpts/" + os.path.basename(path) if path.startswith("http") else path) for path in URLs ]
|
||||
|
||||
choices = [ ("ckpts/" + os.path.basename(path) if path.startswith("http") else path) for path in URLs ]
|
||||
if len(quantization) == 0:
|
||||
quantization = "bf16"
|
||||
|
||||
@@ -2238,21 +2277,35 @@ if args.compile: #args.fastest or
lock_ui_compile = True


def save_model(model, model_type, dtype, config_file, submodel_no = 1):
def save_model(model, model_type, dtype, config_file, submodel_no = 1, is_module = False, filter = None, no_fp16_main_model = True ):
model_def = get_model_def(model_type)
# To save module and quantized modules
# 1) set Transformer Model Quantization Type to 16 bits
# 2) insert in def module_source : path and "model_fp16.safetensors in URLs"
# 3) Generate (only quantized fp16 will be created)
# 4) replace in def module_source : path and "model_bf16.safetensors in URLs"
# 5) Generate (both bf16 and quantized bf16 will be created)
if model_def == None: return
url_key = "URLs" if submodel_no <=1 else "URLs" + str(submodel_no)
if is_module:
url_key = "modules"
source_key = "module_source"
else:
url_key = "URLs" if submodel_no <=1 else "URLs" + str(submodel_no)
source_key = "source"
URLs= model_def.get(url_key, None)
if URLs is None: return
if isinstance(URLs, str):
print("Unable to save model for a finetune that references external files")
return
from mmgp import offload
if dtype == torch.bfloat16:
dtypestr= "bf16"
else:
dtypestr= "fp16"
from mmgp import offload
dtypestr= "bf16" if dtype == torch.bfloat16 else "fp16"
if no_fp16_main_model: dtypestr = dtypestr.replace("fp16", "bf16")
model_filename = None
if is_module:
if not isinstance(URLs,list) or len(URLs) != 1:
print("Target Module files are missing")
return
URLs= URLs[0]
for url in URLs:
if "quanto" not in url and dtypestr in url:
model_filename = os.path.basename(url)
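The numbered comments near the top of save_model describe the module-saving workflow. As a purely hypothetical illustration (every path, name and URL below is invented), a finetune definition that save_model(..., is_module=True) would read and update could look like this:

finetune_def = {
    "model": {
        "name": "My Finetune",
        # weights the module is generated from; removed from the definition once saved
        "module_source": "ckpts/my_module_source.safetensors",
        # single anonymous list; the generated quantized file name gets appended to it
        "modules": [["https://example.com/my_module_bf16.safetensors"]],
        "URLs": "i2v",
    }
}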
@@ -2260,17 +2313,38 @@ def save_model(model, model_type, dtype, config_file, submodel_no = 1):
if model_filename is None:
print(f"No target filename with bf16 or fp16 in its name is mentioned in {url_key}")
return
if not os.path.isfile(model_filename):
offload.save_model(model, os.path.join("ckpts",model_filename), config_file_path=config_file)

finetune_file = os.path.join(os.path.dirname(model_def["path"]) , model_type + ".json")
with open(finetune_file, 'r', encoding='utf-8') as reader:
saved_finetune_def = json.load(reader)

update_model_def = False
model_filename = os.path.join("ckpts",model_filename)
quanto_dtypestr= "bf16" if dtype == torch.bfloat16 else "fp16"
if ("m" + dtypestr) in model_filename:
dtypestr = "m" + dtypestr
quanto_dtypestr = "m" + quanto_dtypestr
if not os.path.isfile(model_filename) and (not no_fp16_main_model or dtype == torch.bfloat16):
offload.save_model(model, model_filename, config_file_path=config_file, filter_sd=filter)
print(f"New model file '{model_filename}' had been created for finetune Id '{model_type}'.")
finetune_file = os.path.join(os.path.dirname(model_def["path"]) , model_type + ".json")
with open(finetune_file, 'r', encoding='utf-8') as reader:
saved_finetune_def = json.load(reader)
del saved_finetune_def["model"]["source"]
del model_def["source"]
del saved_finetune_def["model"][source_key]
del model_def[source_key]
print(f"The 'source' entry has been removed in the '{finetune_file}' definition file.")
update_model_def = True

if is_module:
quanto_filename = model_filename.replace(dtypestr, "quanto_" + quanto_dtypestr + "_int8" )
if hasattr(model, "_quanto_map"):
print("unable to generate quantized module, the main model should at full 16 bits before quantization can be done")
elif not os.path.isfile(quanto_filename):
offload.save_model(model, quanto_filename, config_file_path=config_file, do_quantize= True, filter_sd=filter)
print(f"New quantized file '{quanto_filename}' had been created for finetune Id '{model_type}'.")
model_def[url_key][0].append(quanto_filename)
saved_finetune_def["model"][url_key][0].append(quanto_filename)
update_model_def = True
if update_model_def:
with open(finetune_file, "w", encoding="utf-8") as writer:
writer.write(json.dumps(saved_finetune_def, indent=4))
print(f"The 'source' entry has been removed in the '{finetune_file}' definition file.")

def save_quantized_model(model, model_type, model_filename, dtype, config_file, submodel_no = 1):
if "quanto" in model_filename: return
@@ -2414,18 +2488,19 @@ def download_models(model_filename = None, model_type= None, module_type = None,
model_def = get_model_def(model_type)

source = model_def.get("source", None)
module_source = model_def.get("module_source", None)
model_type_handler = model_types_handlers[base_model_type]

key_name = "URLs" if submodel_no <= 1 else f"URLs{submodel_no}"
if source is not None:
if source is not None and module_type is None or module_source is not None and module_type is not None:
model_filename = None
elif module_type is not None:
modules_files = model_type_handler.query_modules_files() if hasattr(model_type_handler, "query_modules_files") else {}
if module_type not in modules_files:
raise Exception(f"Unknown module {model_type} for model type {model_type}")
else:
if not os.path.isfile(model_filename):
URLs = get_model_recursive_prop(model_type, key_name, return_list= False)
if module_type is not None:
key_name = "modules"
URLs = module_type
else:
key_name = "URLs" if submodel_no <= 1 else f"URLs{submodel_no}"
URLs = get_model_recursive_prop(model_type, key_name, return_list= False)
if isinstance(URLs, str):
raise Exception("Missing model " + URLs)
use_url = model_filename
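The combined test added above relies on Python operator precedence (and binds tighter than or); written with explicit parentheses it is equivalent to:

# equivalent grouping of the new condition in download_models
if (source is not None and module_type is None) or (module_source is not None and module_type is not None):
    model_filename = None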
@@ -2761,6 +2836,13 @@ def generate_header(model_type, compile, attention_mode):

return header

def release_RAM():
if gen_in_progress:
gr.Info("Unable to release RAM when a Generation is in Progress")
else:
release_model()
gr.Info("Models stored in RAM have been released")

def apply_changes( state,
transformer_types_choices,
transformer_dtype_policy_choice,
@@ -2997,9 +3079,8 @@ def refresh_gallery(state): #, msg
choice = gen.get("selected",0)
header_text = gen.get("header_text", "")
in_progress = "in_progress" in gen
if in_progress:
if gen.get("last_selected", True):
choice = max(len(file_list) - 1,0)
if gen.get("last_selected", True) and file_list is not None:
choice = max(len(file_list) - 1,0)

queue = gen.get("queue", [])
abort_interactive = not gen.get("abort", False)
@@ -4270,10 +4351,7 @@ def generate_video(

if model_type != transformer_type or reload_needed or override_profile>0 and override_profile != loaded_profile or override_profile<0 and default_profile != loaded_profile:
wan_model = None
if offloadobj is not None:
offloadobj.release()
offloadobj = None
gc.collect()
release_model()
send_cmd("status", f"Loading model {get_model_name(model_type)}...")
wan_model, offloadobj = load_models(model_type, override_profile)
send_cmd("status", "Model loaded")
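release_model() itself is defined outside this excerpt; judging only from the inline code it replaces here and in preload_model_when_switching / unload_model_if_needed further down, an assumed reconstruction (not the actual definition) would be roughly:

def release_model():
    # assumption inferred from the removed call-site code: drop the transformer,
    # release the offload object, then force a garbage collection
    global wan_model, offloadobj, reload_needed
    wan_model = None
    if offloadobj is not None:
        offloadobj.release()
        offloadobj = None
    gc.collect()
    reload_needed = True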
@@ -4385,6 +4463,11 @@ def generate_video(
control_audio_tracks, _ = extract_audio_tracks(video_guide)
if video_source is not None:
source_audio_tracks, source_audio_metadata = extract_audio_tracks(video_source)
video_fps, _, _, video_frames_count = get_video_info(video_source)
video_source_duration = video_frames_count / video_fps
else:
video_source_duration = 0

reset_control_aligment = "T" in video_prompt_type

if test_any_sliding_window(model_type) :
@@ -4424,7 +4507,7 @@ def generate_video(
send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
from shared.utils.utils import resize_and_remove_background
image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= not (vace or hunyuan_avatar or flux or qwen) ) # no fit for vace ref images as it is done later
image_refs[nb_frames_positions:] = resize_and_remove_background(image_refs[nb_frames_positions:] , width, height, remove_background_images_ref > 0, any_background_ref, fit_into_canvas= False if (vace or hunyuan_avatar or flux or qwen) else fit_canvas ) # no fit for vace ref images as it is done later
update_task_thumbnails(task, locals())
send_cmd("output")
joint_pass = boost ==1 #and profile != 1 and profile != 3
@@ -4475,6 +4558,7 @@ def generate_video(
audio_guide, audio_guide2 = get_available_filename(save_path, audio_guide, "_tmp1", ".wav"), get_available_filename(save_path, audio_guide, "_tmp2", ".wav")
extract_dual_audio(original_audio_guide, audio_guide, audio_guide2 )
output_new_audio_filepath = original_audio_guide

current_video_length = min(int(fps * duration //latent_size) * latent_size + latent_size + 1, current_video_length)
if fantasy:
# audio_proj_split_full, audio_context_lens_full = parse_audio(audio_guide, num_frames= max_source_video_frames, fps= fps, padded_frames_for_embeddings= (reuse_frames if reset_control_aligment else 0), device= processing_device )
@@ -4482,7 +4566,8 @@ def generate_video(
elif multitalk:
from models.wan.multitalk.multitalk import get_full_audio_embeddings
# pad audio_proj_full if aligned to beginning of window to simulate source window overlap
audio_proj_full, output_new_audio_data = get_full_audio_embeddings(audio_guide1 = audio_guide, audio_guide2= audio_guide2, combination_type= combination_type , num_frames= max_source_video_frames, sr= audio_sampling_rate, fps =fps, padded_frames_for_embeddings = (reuse_frames if reset_control_aligment else 0))
min_audio_duration = current_video_length/fps if reset_control_aligment else video_source_duration + current_video_length/fps
audio_proj_full, output_new_audio_data = get_full_audio_embeddings(audio_guide1 = audio_guide, audio_guide2= audio_guide2, combination_type= combination_type , num_frames= max_source_video_frames, sr= audio_sampling_rate, fps =fps, padded_frames_for_embeddings = (reuse_frames if reset_control_aligment else 0), min_audio_duration = min_audio_duration)
if output_new_audio_filepath is not None: output_new_audio_data = None
if not args.save_speakers and "X" in audio_prompt_type:
os.remove(audio_guide)
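A worked example of the new min_audio_duration, with invented numbers:

fps, current_video_length, video_source_duration = 25, 81, 4.0
current_video_length / fps                          # 3.24 s needed when alignment is reset ("T" flag)
video_source_duration + current_video_length / fps  # 7.24 s needed when continuing the source video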
@@ -4532,7 +4617,7 @@ def generate_video(
repeat_no +=1
gen["repeat_no"] = repeat_no
src_video, src_mask, src_ref_images = None, None, None
prefix_video = None
prefix_video = pre_video_frame = None
source_video_overlap_frames_count = 0 # number of frames overalapped in source video for first window
source_video_frames_count = 0 # number of frames to use in source video (processing starts source_video_overlap_frames_count frames before )
frames_already_processed = None
@@ -4592,13 +4677,13 @@ def generate_video(
image_start_tensor = image_end_tensor = None
if window_no == 1 and (video_source is not None or image_start is not None):
if image_start is not None:
new_height, new_width = calculate_new_dimensions(height, width, image_start.height, image_start.width, fit_canvas, 32)
new_height, new_width = calculate_new_dimensions(height, width, image_start.height, image_start.width, sample_fit_canvas, block_size = block_size)
image_start_tensor = image_start.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)
image_start_tensor = torch.from_numpy(np.array(image_start_tensor).astype(np.float32)).div_(127.5).sub_(1.).movedim(-1, 0)
image_start_tensor = convert_image_to_tensor(image_start_tensor)
pre_video_guide = prefix_video = image_start_tensor.unsqueeze(1)
if image_end is not None:
image_end_tensor = image_end.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)
image_end_tensor = torch.from_numpy(np.array(image_end_tensor).astype(np.float32)).div_(127.5).sub_(1.).movedim(-1, 0)
image_end_tensor = convert_image_to_tensor(image_end_tensor)
else:
if "L" in image_prompt_type:
from shared.utils.utils import get_video_frame
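convert_image_to_tensor is not shown in this diff; presumably it matches the inline conversion it replaces above, along the lines of:

def convert_image_to_tensor(image):
    # assumed equivalent of the removed code: PIL image -> float32 tensor
    # normalized to [-1, 1], channels first (c, h, w)
    return torch.from_numpy(np.array(image).astype(np.float32)).div_(127.5).sub_(1.).movedim(-1, 0)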
@@ -4607,6 +4692,7 @@ def generate_video(
prefix_video = prefix_video.permute(3, 0, 1, 2)
prefix_video = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w
pre_video_guide = prefix_video[:, -reuse_frames:]
pre_video_frame = convert_tensor_to_image(prefix_video[:, -1])
source_video_overlap_frames_count = pre_video_guide.shape[1]
source_video_frames_count = prefix_video.shape[1]
if sample_fit_canvas != None: image_size = pre_video_guide.shape[-2:]
@@ -4829,6 +4915,7 @@ def generate_video(
window_no = window_no,
offloadobj = offloadobj,
set_header_text= set_header_text,
pre_video_frame = pre_video_frame,
)
except Exception as e:
if len(control_audio_tracks) > 0 or len(source_audio_tracks) > 0:
@@ -6380,10 +6467,7 @@ def preload_model_when_switching(state):
model_type = state["model_type"]
if model_type != transformer_type:
wan_model = None
if offloadobj is not None:
offloadobj.release()
offloadobj = None
gc.collect()
release_model()
model_filename = get_model_name(model_type)
yield f"Loading model {model_filename}..."
wan_model, offloadobj = load_models(model_type)
@@ -6393,15 +6477,11 @@ def preload_model_when_switching(state):
return gr.Text()

def unload_model_if_needed(state):
global reload_needed, wan_model, offloadobj
global wan_model
if "U" in preload_model_policy:
if wan_model != None:
wan_model = None
if offloadobj is not None:
offloadobj.release()
offloadobj = None
gc.collect()
reload_needed= True
release_model()

def all_letters(source_str, letters):
for letter in letters:
@@ -6825,7 +6905,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
vace = test_vace_module(base_model_type)
phantom = base_model_type in ["phantom_1.3B", "phantom_14B"]
fantasy = base_model_type in ["fantasy"]
multitalk = model_def.get("multitalk_class", False)
multitalk = model_def.get("multitalk_class", False)
infinitetalk = base_model_type in ["infinitetalk"]
hunyuan_t2v = "hunyuan_video_720" in model_filename
hunyuan_i2v = "hunyuan_video_i2v" in model_filename
hunyuan_video_custom = "hunyuan_video_custom" in model_filename
@@ -6864,7 +6945,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non


with gr.Column(visible= test_class_i2v(model_type) or hunyuan_i2v or diffusion_forcing or ltxv or recammaster or vace or ti2v_2_2) as image_prompt_column:
if vace:
if vace or infinitetalk:
image_prompt_type_value= ui_defaults.get("image_prompt_type","")
image_prompt_type_value = "" if image_prompt_type_value == "S" else image_prompt_type_value
image_prompt_type = gr.Radio( [("New Video", ""),("Continue Video File", "V"),("Continue Last Video", "L")], value =image_prompt_type_value, label="Source Video", show_label= False, visible= not image_outputs , scale= 3)
@@ -6967,7 +7048,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
model_mode = gr.Dropdown(value=None, visible=False)
keep_frames_video_source = gr.Text(visible=False)

with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or hunyuan_video_custom_edit or t2v or ltxv or flux and model_reference_image or qwen and model_reference_image) as video_prompt_column:
with gr.Column(visible= vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or hunyuan_video_custom_edit or t2v or ltxv or infinitetalk or flux and model_reference_image or qwen and model_reference_image) as video_prompt_column:
video_prompt_type_value= ui_defaults.get("video_prompt_type","")
video_prompt_type = gr.Text(value= video_prompt_type_value, visible= False)
any_control_video = True
@@ -7107,8 +7188,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
)
else:
video_prompt_type_image_refs = gr.Dropdown(
choices=[ ("Start / Ref Image", "I")],
value="I",
choices=[ ("Start", "KI"),("Ref Image", "I")],
value="KI" if model_def.get("first_ref_is_start_image", False) else "I",
visible = False,
label="Start / Reference Images", scale = 2
)
@@ -7135,8 +7216,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
video_mask = gr.Video(label= "Video Mask Area (for Inpainting, white = Control Area, black = Unchanged)", visible= (not image_outputs) and "V" in video_prompt_type_value and "A" in video_prompt_type_value and not "U" in video_prompt_type_value , value= ui_defaults.get("video_mask", None))

mask_expand = gr.Slider(-10, 50, value=ui_defaults.get("mask_expand", 0), step=1, label="Expand / Shrink Mask Area", visible= "V" in video_prompt_type_value and "A" in video_prompt_type_value and not "U" in video_prompt_type_value )
any_reference_image = vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or (flux or qwen) and model_reference_image
image_refs = gr.Gallery(preview= True, label ="Start Image" if hunyuan_video_avatar else "Reference Images",
any_reference_image = vace or phantom or hunyuan_video_custom or hunyuan_video_avatar or infinitetalk or (flux or qwen) and model_reference_image
image_refs = gr.Gallery(preview= True, label ="Start Image" if hunyuan_video_avatar else "Reference Images" + (" (each Image will start a new Clip)" if infinitetalk else ""),
type ="pil", show_label= True,
columns=[3], rows=[1], object_fit="contain", height="auto", selected_index=0, interactive= True, visible= "I" in video_prompt_type_value,
value= ui_defaults.get("image_refs", None),
@@ -7149,21 +7230,27 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
("Remove Backgrounds only behind People / Objects except main Subject / Landscape" if (flux or qwen) else "Remove Backgrounds only behind People / Objects" , 1),
],
value=ui_defaults.get("remove_background_images_ref",1),
label="Automatic Removal of Background of People or Objects (Only)", scale = 3, visible= "I" in video_prompt_type_value and not hunyuan_video_avatar
label="Automatic Removal of Background of People or Objects (Only)", scale = 3, visible= "I" in video_prompt_type_value and not model_def.get("no_background_removal", False)
)

any_audio_voices_support = any_audio_track(base_model_type)
audio_prompt_type_value = ui_defaults.get("audio_prompt_type", "A" if any_audio_voices_support else "")
audio_prompt_type = gr.Text(value= audio_prompt_type_value, visible= False)
if any_audio_voices_support:
audio_prompt_type_sources = gr.Dropdown(
choices=[
("None", ""),
("One Person Speaking Only", "A"),
any_single_speaker = not model_def.get("multi_speakers_only", False)
if not any_single_speaker and "A" in audio_prompt_type_value and not ("B" in audio_prompt_type_value or "X" in audio_prompt_type_value): audio_prompt_type_value = del_in_sequence(audio_prompt_type_value, "XCPAB")
any_multi_speakers = not model_def.get("one_speaker_only", False)
if not any_multi_speakers: audio_prompt_type_value = del_in_sequence(audio_prompt_type_value, "XCPB")

speaker_choices=[("None", "")]
if any_single_speaker: speaker_choices += [("One Person Speaking Only", "A")]
if any_multi_speakers:speaker_choices += [
("Two speakers, Auto Separation of Speakers (will work only if there is little background noise)", "XA"),
("Two speakers, Speakers Audio sources are assumed to be played in a Row", "CAB"),
("Two speakers, Speakers Audio sources are assumed to be played in Parallel", "PAB"),
],
("Two speakers, Speakers Audio sources are assumed to be played in Parallel", "PAB")
]
audio_prompt_type_sources = gr.Dropdown(
choices=speaker_choices,
value= filter_letters(audio_prompt_type_value, "XCPAB"),
label="Voices", scale = 3, visible = multitalk and not image_outputs
)
@@ -7560,7 +7647,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
],
value=filter_letters(video_prompt_type_value, "T"),
label="Control Video / Control Audio temporal alignment when any Source Video",
visible = vace or ltxv or t2v
visible = vace or ltxv or t2v or infinitetalk
)

multi_prompts_gen_type = gr.Dropdown(
@@ -8320,6 +8407,10 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice
label="Profile (for power users only, not needed to change it)"
)
preload_in_VRAM_choice = gr.Slider(0, 40000, value=server_config.get("preload_in_VRAM", 0), step=100, label="Number of MB of Models that are Preloaded in VRAM (0 will use Profile default)")

release_RAM_btn = gr.Button("Force Release RAM")
release_RAM_btn.click(fn=release_RAM)

with gr.Tab("Extensions"):
enhancer_enabled_choice = gr.Dropdown(
choices=[
@@ -8418,7 +8509,7 @@ def generate_configuration_tab(state, blocks, header, model_family, model_choice



msg = gr.Markdown()
msg = gr.Markdown()
apply_btn = gr.Button("Apply Changes")
apply_btn.click(
fn=apply_changes,