diff --git a/README.md b/README.md
index d6ba630..0833317 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,15 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
 
 ## 🔥 Latest Updates :
+### July 30 2025: WanGP v7.5: Just another release ... Wan 2.2 part 2
+Wan 2.2 image2video is now available: a very good model if you want to set Start and End frames. Two Wan 2.2 models delivered, only one to go ...
+
+Please note that although it is an image2video model, it is structurally very close to Wan 2.2 text2video (same layers, only a different initial projection). Given that Wan 2.1 image2video loras don't work too well with it (half of their tensors are not supported), I have decided that this model will look for its loras in the text2video loras folder instead of the image2video folder.
+
+I have also optimized RAM management with Wan 2.2 so that loras and modules are loaded only once in RAM and Reserved RAM. This saves up to 5 GB of RAM, which can make a difference...
+
+And this time I really removed Vace Cocktail Light, which produced blurry videos.
+
 ### July 29 2025: WanGP v7.4: Just another release ... Wan 2.2 Preview
 Wan 2.2 is here. The good news is that WanGP won't require a single byte of extra VRAM to run it and it will be as fast as Wan 2.1. The bad news is that you will need much more RAM if you want to fully leverage this new model since it has twice as many parameters.
 
diff --git a/configs/i2v_720p.json b/configs/i2v_2_2.json
similarity index 75%
rename from configs/i2v_720p.json
rename to configs/i2v_2_2.json
index f5a12b2..a64a868 100644
--- a/configs/i2v_720p.json
+++ b/configs/i2v_2_2.json
@@ -1,14 +1,14 @@
 {
     "_class_name": "WanModel",
-    "_diffusers_version": "0.30.0",
+    "_diffusers_version": "0.33.0",
     "dim": 5120,
     "eps": 1e-06,
     "ffn_dim": 13824,
     "freq_dim": 256,
     "in_dim": 36,
-    "model_type": "i2v",
+    "model_type": "i2v2_2",
     "num_heads": 40,
     "num_layers": 40,
     "out_dim": 16,
     "text_len": 512
-}
+}
\ No newline at end of file
diff --git a/defaults/i2v_2_2.json b/defaults/i2v_2_2.json
new file mode 100644
index 0000000..0950996
--- /dev/null
+++ b/defaults/i2v_2_2.json
@@ -0,0 +1,24 @@
+{
+    "model":
+    {
+        "name": "Wan2.2 Image2video 14B",
+        "architecture" : "i2v_2_2",
+        "description": "Wan 2.2 Image 2 Video model. Unlike the Wan 2.1 Image2video model, this model is structurally close to the t2v model. Consequently, you will need to store the Loras for this model in the t2v Lora folder.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_quanto_mfp16_int8.safetensors"
+        ],
+        "URLs2": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_quanto_mfp16_int8.safetensors"
+        ],
+        "group": "wan2_2"
+    },
+    "switch_threshold" : 900,
+    "guidance_scale" : 3.5,
+    "guidance2_scale" : 3.5,
+    "flow_shift" : 5
+
+}
\ No newline at end of file
diff --git a/defaults/vace_14B_cocktail_2_2_light.json b/defaults/vace_14B_cocktail_2_2_light.json
deleted file mode 100644
index 34dee9a..0000000
--- a/defaults/vace_14B_cocktail_2_2_light.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "model": {
-        "name": "Wan2.2 Vace Cocktail Light 14B",
-        "architecture": "vace_14B",
-        "modules": [
-            "vace_14B"
-        ],
-        "description": "This model has been created on the fly using the Wan text 2.2 video model and the Loras of FusioniX. The weight of the Detail Enhancer Lora has been reduced to improve identity preservation. Only the high noise part of the v2.2 model is used to reduce RAM usage.",
-        "URLs": "t2v_2_2",
-        "loras": "vace_14B_cocktail_2_2",
-        "loras_multipliers": "vace_14B_cocktail_2_2",
-        "group": "wan2_2"
-    },
-    "num_inference_steps": 10,
-    "guidance_scale": 1,
-    "guidance2_scale": 1,
-    "flow_shift": 2
-}
\ No newline at end of file
diff --git a/flux/sampling.py b/flux/sampling.py
index 7f14b09..a8f9aae 100644
--- a/flux/sampling.py
+++ b/flux/sampling.py
@@ -227,7 +227,7 @@ def prepare_kontext(
 
     img_cond_seq = None
     img_cond_seq_ids = None
-
+    if img_cond_list == None: img_cond_list = []
     for cond_no, img_cond in enumerate(img_cond_list):
         width, height = img_cond.size
         aspect_ratio = width / height
diff --git a/wan/any2video.py b/wan/any2video.py
index 816b488..c0e5827 100644
--- a/wan/any2video.py
+++ b/wan/any2video.py
@@ -88,7 +88,8 @@ class WanAny2V:
             tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
             shard_fn= None)
 
-        if hasattr(config, "clip_checkpoint"):
+        # base_model_type = "i2v2_2"
+        if hasattr(config, "clip_checkpoint") and not base_model_type in ["i2v_2_2"]:
             self.clip = CLIPModel(
                 dtype=config.clip_dtype,
                 device=self.device,
@@ -108,30 +109,35 @@ class WanAny2V:
         # with open(config_filename, 'r', encoding='utf-8') as f:
         #     config = json.load(f)
         # sd = safetensors2.torch_load_file(xmodel_filename)
-        # model_filename = "c:/temp/wan2.2t2v/high/diffusion_pytorch_model-00001-of-00006.safetensors"
+        # model_filename = "c:/temp/wan2.2i2v/low/diffusion_pytorch_model-00001-of-00006.safetensors"
         base_config_file = f"configs/{base_model_type}.json"
         forcedConfigPath = base_config_file if len(model_filename) > 1 else None
         # forcedConfigPath = base_config_file = f"configs/flf2v_720p.json"
         # model_filename[1] = xmodel_filename
-        model_filename2 = None
-        if self.transformer_switch:
-            model_filename2 = model_filename[1:]
-            model_filename = model_filename[:1] + model_filename[2:]
-        self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
-        if model_filename2 is not None:
-            self.model2 = offload.fast_load_transformers_model(model_filename2, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
+        if self.transformer_switch:
+            shared_modules= {}
+            self.model = offload.fast_load_transformers_model(model_filename[:1], modules = model_filename[2:], modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath, return_shared_modules= shared_modules)
+            self.model2 = offload.fast_load_transformers_model(model_filename[1:2], modules = shared_modules, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
+            shared_modules = None
+        else:
+            self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath)
+
         # self.model = offload.load_model_data(self.model, xmodel_filename )
         # offload.load_model_data(self.model, "c:/temp/Phantom-Wan-1.3B.pth")
+
         self.model.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype)
         offload.change_dtype(self.model, dtype, True)
         if self.model2 is not None:
             self.model2.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype)
             offload.change_dtype(self.model2, dtype, True)
+
         # offload.save_model(self.model, "wan2.1_text2video_1.3B_mbf16.safetensors", do_quantize= False, config_file_path=base_config_file, filter_sd=sd)
-        # offload.save_model(self.model, "wan2.2_text2video_14B_high_mbf16.safetensors", config_file_path=base_config_file)
-        # offload.save_model(self.model, "wan2.2_text2video_14B_high_quanto_mfp16_int8.safetensors", do_quantize=True, config_file_path=base_config_file)
+        # offload.save_model(self.model, "wan2.2_image2video_14B_low_mbf16.safetensors", config_file_path=base_config_file)
+        # offload.save_model(self.model, "wan2.2_image2video_14B_low_quanto_mbf16_int8.safetensors", do_quantize=True, config_file_path=base_config_file)
         self.model.eval().requires_grad_(False)
+        if self.model2 is not None:
+            self.model2.eval().requires_grad_(False)
         if save_quantized:
             from wgp import save_quantized_model
             save_quantized_model(self.model, model_type, model_filename[0], dtype, base_config_file)
@@ -480,8 +486,11 @@ class WanAny2V:
 
             any_end_frame = False
             if input_frames != None:
                 _ , preframes_count, height, width = input_frames.shape
-                lat_h, lat_w = height // self.vae_stride[1], width // self.vae_stride[2]
-                clip_context = self.clip.visual([input_frames[:, -1:]]) if model_type != "flf2v_720p" else self.clip.visual([input_frames[:, -1:], input_frames[:, -1:]])
+                lat_h, lat_w = height // self.vae_stride[1], width // self.vae_stride[2]
+                if hasattr(self, "clip"):
+                    clip_context = self.clip.visual([input_frames[:, -1:]]) if model_type != "flf2v_720p" else self.clip.visual([input_frames[:, -1:], input_frames[:, -1:]])
+                else:
+                    clip_context = None
                 input_frames = input_frames.to(device=self.device).to(dtype= self.VAE_dtype)
                 enc = torch.concat( [input_frames, torch.zeros( (3, frame_num-preframes_count, height, width), device=self.device, dtype= self.VAE_dtype)],
@@ -513,19 +522,24 @@ class WanAny2V:
                                     self.patch_size[2] * self.patch_size[2])
                 h = lat_h * self.vae_stride[1]
                 w = lat_w * self.vae_stride[2]
-                clip_image_size = self.clip.model.image_size
                 img_interpolated = resize_lanczos(image_start, h, w).sub_(0.5).div_(0.5).unsqueeze(0).transpose(0,1).to(self.device) #, self.dtype
-                image_start = resize_lanczos(image_start, clip_image_size, clip_image_size)
-                image_start = image_start.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
-                color_reference_frame = image_start.clone()
+                color_reference_frame = img_interpolated.clone()
                 if image_end!= None:
                     img_interpolated2 = resize_lanczos(image_end, h, w).sub_(0.5).div_(0.5).unsqueeze(0).transpose(0,1).to(self.device) #, self.dtype
-                    image_end = resize_lanczos(image_end, clip_image_size, clip_image_size)
-                    image_end = image_end.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
-                if model_type == "flf2v_720p":
-                    clip_context = self.clip.visual([image_start[:, None, :, :], image_end[:, None, :, :] if image_end != None else image_start[:, None, :, :]])
+
+                if hasattr(self, "clip"):
+                    clip_image_size = self.clip.model.image_size
+                    image_start = resize_lanczos(image_start, clip_image_size, clip_image_size)
+                    image_start = image_start.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
+                    if image_end!= None:
+                        image_end = resize_lanczos(image_end, clip_image_size, clip_image_size)
+                        image_end = image_end.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
+                    if model_type == "flf2v_720p":
+                        clip_context = self.clip.visual([image_start[:, None, :, :], image_end[:, None, :, :] if image_end != None else image_start[:, None, :, :]])
+                    else:
+                        clip_context = self.clip.visual([image_start[:, None, :, :]])
                 else:
-                    clip_context = self.clip.visual([image_start[:, None, :, :]])
+                    clip_context = None
 
                 if any_end_frame:
                     enc= torch.concat([
@@ -563,7 +577,9 @@ class WanAny2V:
                     extended_overlapped_latents = lat_y[:, :overlapped_latents_frames_num].clone().unsqueeze(0)
                 y = torch.concat([msk, lat_y])
                 lat_y = None
-                kwargs.update({'clip_fea': clip_context, 'y': y})
+                kwargs.update({ 'y': y})
+                if not clip_context is None:
+                    kwargs.update({'clip_fea': clip_context})
 
             # Recam Master
             if target_camera != None:
diff --git a/wan/modules/model.py b/wan/modules/model.py
index a6e86f9..b67478e 100644
--- a/wan/modules/model.py
+++ b/wan/modules/model.py
@@ -789,7 +789,7 @@ class WanModel(ModelMixin, ConfigMixin):
             sd = new_sd
 
         from wgp import test_class_i2v
-        if not test_class_i2v(model_type):
+        if not test_class_i2v(model_type) or model_type in ["i2v_2_2"]:
             new_sd = {}
             # convert loras for i2v to t2v
             for k,v in sd.items():
@@ -842,7 +842,7 @@ class WanModel(ModelMixin, ConfigMixin):
 
         super().__init__()
 
-        assert model_type in ['t2v', 'i2v']
+        assert model_type in ['t2v', 'i2v', 'i2v2_2']
         self.model_type = model_type
 
         self.patch_size = patch_size
@@ -889,7 +889,7 @@ class WanModel(ModelMixin, ConfigMixin):
         # blocks
         if vace_layers == None:
-            cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn'
+            cross_attn_type = 't2v_cross_attn' if model_type in ['t2v','i2v2_2'] else 'i2v_cross_attn'
             self.blocks = nn.ModuleList([
                 WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm,
                                   cross_attn_norm, eps, block_no =i, output_dim=multitalk_output_dim, norm_input_visual=norm_input_visual)
diff --git a/wgp.py b/wgp.py
index 36ad1d8..76947ed 100644
--- a/wgp.py
+++ b/wgp.py
@@ -50,8 +50,8 @@ global_queue_ref = []
 AUTOSAVE_FILENAME = "queue.zip"
 PROMPT_VARS_MAX = 10
 
-target_mmgp_version = "3.5.1"
-WanGP_version = "7.4" +target_mmgp_version = "3.5.3" +WanGP_version = "7.5" settings_version = 2.23 max_source_video_frames = 3000 prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None @@ -1581,7 +1581,7 @@ def _parse_args(): def get_lora_dir(model_type): model_family = get_model_family(model_type) - i2v = test_class_i2v(model_type) + i2v = test_class_i2v(model_type) and not get_base_model_type(model_type) == "i2v_2_2" if model_family == "wan": lora_dir =args.lora_dir if i2v and len(lora_dir)==0: @@ -1691,7 +1691,8 @@ for path in ["wan2.1_Vace_1.3B_preview_bf16.safetensors", "sky_reels2_diffusion "sky_reels2_diffusion_forcing_720p_14B_quanto_int8.safetensors", "sky_reels2_diffusion_forcing_720p_14B_quanto_fp16_int8.safetensors", "wan2.1_image2video_480p_14B_bf16.safetensors", "wan2.1_image2video_480p_14B_quanto_int8.safetensors", "wan2.1_image2video_720p_14B_quanto_int8.safetensors", "wan2.1_image2video_720p_14B_quanto_fp16_int8.safetensors", "wan2.1_image2video_720p_14B_bf16.safetensors", "wan2.1_text2video_14B_bf16.safetensors", "wan2.1_text2video_14B_quanto_int8.safetensors", -"wan2.1_Vace_14B_mbf16.safetensors", "wan2.1_Vace_14B_quanto_mbf16_int8.safetensors", "wan2.1_FLF2V_720p_14B_quanto_int8.safetensors", "wan2.1_FLF2V_720p_14B_bf16.safetensors", "wan2.1_FLF2V_720p_14B_fp16.safetensors", "wan2.1_Vace_1.3B_mbf16.safetensors", "wan2.1_text2video_1.3B_bf16.safetensors" +"wan2.1_Vace_14B_mbf16.safetensors", "wan2.1_Vace_14B_quanto_mbf16_int8.safetensors", "wan2.1_FLF2V_720p_14B_quanto_int8.safetensors", "wan2.1_FLF2V_720p_14B_bf16.safetensors", "wan2.1_FLF2V_720p_14B_fp16.safetensors", "wan2.1_Vace_1.3B_mbf16.safetensors", "wan2.1_text2video_1.3B_bf16.safetensors", +"ltxv_0.9.7_13B_dev_bf16.safetensors" ]: if Path(os.path.join("ckpts" , path)).is_file(): print(f"Removing old version of model '{path}'. 
@@ -1712,7 +1713,7 @@ modules_files = {
 
 base_types = ["multitalk", "fantasy", "vace_14B", "vace_multitalk_14B", "t2v_1.3B", "t2v", "vace_1.3B", "phantom_1.3B", "phantom_14B", "recam_1.3B", "sky_df_1.3B", "sky_df_14B",
-              "i2v", "flf2v_720p", "fun_inp_1.3B", "fun_inp", "ltxv_13B",
+              "i2v", "i2v_2_2", "flf2v_720p", "fun_inp_1.3B", "fun_inp", "ltxv_13B",
               "hunyuan", "hunyuan_i2v", "hunyuan_custom", "hunyuan_custom_audio", "hunyuan_custom_edit", "hunyuan_avatar", "flux"
 ]
@@ -1792,7 +1793,7 @@ def get_model_family(model_type, for_ui = False):
 
 def test_class_i2v(model_type):
     model_type = get_base_model_type(model_type)
-    return model_type in ["i2v", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk" ] #"hunyuan_i2v",
+    return model_type in ["i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk" ] #"hunyuan_i2v",
 
 def test_vace_module(model_type):
     model_type = get_base_model_type(model_type)
@@ -2632,7 +2633,8 @@ def load_wan_model(model_filename, model_type, base_model_type, model_def, quant
     pipe = {"transformer": wan_model.model, "text_encoder" : wan_model.text_encoder.model, "vae": wan_model.vae.model }
     if wan_model.model2 is not None:
         pipe["transformer2"] = wan_model.model2
-
+    # del pipe["transformer"]
+    # pipe["transformer"] = wan_model.model
     if hasattr(wan_model, "clip"):
         pipe["text_encoder_2"] = wan_model.clip.model
     return wan_model, pipe
@@ -2803,10 +2805,9 @@ def load_models(model_type):
 
     if "transformer2" in pipe:
         loras_transformer += ["transformer2"]
-        if profile in [2,4]:
+        if profile in [3,4]:
             kwargs["pinnedMemory"] = ["transformer", "transformer2"]
-
     global prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer
     if server_config.get("enhancer_enabled", 0) == 1:
         from transformers import ( AutoModelForCausalLM, AutoProcessor, AutoTokenizer, LlamaForCausalLM )
@@ -3317,11 +3318,15 @@ def select_video(state, input_file_list, event_data: gr.EventData):
         if video_length != frames_count:
             video_length_summary += f"real: {frames_count} frames, "
         video_length_summary += f"{frames_count/fps:.1f}s, {round(fps)} fps)"
         video_guidance_scale = configs.get("guidance_scale", None)
+        video_guidance2_scale = configs.get("guidance2_scale", None)
+        video_switch_threshold = configs.get("switch_threshold", 0)
         video_embedded_guidance_scale = configs.get("embedded_guidance_scale ", None)
         if model_family in ["hunyuan", "flux"]:
             video_guidance_scale = video_embedded_guidance_scale
             video_guidance_label = "Embedded Guidance Scale"
         else:
+            if video_switch_threshold > 0:
+                video_guidance_scale = f"{video_guidance_scale} (High Noise), {video_guidance2_scale} (Low Noise) with Switch at Noise Level {video_switch_threshold}"
             video_guidance_label = "Guidance"
         video_flow_shift = configs.get("flow_shift", None)
         video_video_guide_outpainting = configs.get("video_guide_outpainting", "")
@@ -4232,15 +4237,15 @@ def generate_video(
                     loras_selected = transformer_loras_filenames + loras_selected
                     loras_list_mult_choices_nums = transformer_loras_multipliers + loras_list_mult_choices_nums
                     loras_slists = transformer_loras_multipliers + loras_slists
-                trans_list = [trans]
-                if trans2 is not None: trans_list += [trans2]
-                for trans_item in trans_list:
-                    offload.load_loras_into_model(trans_item, loras_selected, loras_list_mult_choices_nums, activate_all_loras=True, preprocess_sd=get_loras_preprocessor(trans, base_model_type), pinnedLora=pinnedLora, split_linear_modules_map = split_linear_modules_map)
-                errors = trans._loras_errors
-                if len(errors) > 0:
-                    error_files = [msg for _ , msg in errors]
-                    raise gr.Error("Error while loading Loras: " + ", ".join(error_files))
-                trans_item = trans_list = None
+
+                offload.load_loras_into_model(trans , loras_selected, loras_list_mult_choices_nums, activate_all_loras=True, preprocess_sd=get_loras_preprocessor(trans, base_model_type), pinnedLora=pinnedLora, split_linear_modules_map = split_linear_modules_map)
+                errors = trans._loras_errors
+                if len(errors) > 0:
+                    error_files = [msg for _ , msg in errors]
+                    raise gr.Error("Error while loading Loras: " + ", ".join(error_files))
+                if trans2 is not None:
+                    offload.sync_models_loras(trans, trans2)
+
     seed = None if seed == -1 else seed
     # negative_prompt = "" # not applicable in the inference
     original_filename = model_filename
@@ -4619,7 +4624,9 @@ def generate_video(
                     if len(extra_process_list) == 1:
                         status_info += " and " + processes_names[extra_process_list[0]]
                     elif len(extra_process_list) == 2:
-                        status_info += ", " + processes_names[extra_process_list[0]] + " and " + processes_names[extra_process_list[1]]
+                        status_info += ", " + processes_names[extra_process_list[0]] + " and " + processes_names[extra_process_list[1]]
+                    if preprocess_type2 is not None:
+                        context_scale = [ control_net_weight /2, control_net_weight2 /2]
                     send_cmd("progress", [0, get_latest_status(state, status_info)])
                     video_guide_processed, video_mask_processed = preprocess_video_with_mask(video_guide, video_mask, height=image_size[0], width = image_size[1], max_frames= len(keep_frames_parsed) , start_frame = guide_start_frame, fit_canvas = sample_fit_canvas, target_fps = fps, process_type = preprocess_type, expand_scale = mask_expand, RGB_Mask = True, negate_mask = "N" in video_prompt_type, process_outside_mask = process_outside_mask, outpainting_dims = outpainting_dims, proc_no =1 )
                     if preprocess_type2 != None:
@@ -4768,6 +4775,7 @@ def generate_video(
         remove_temp_filenames(temp_filenames_list)
         offloadobj.unload_all()
         offload.unload_loras_from_model(trans)
+        if trans2 is not None: offload.unload_loras_from_model(trans2)
         # if compile:
         #     cache_size = torch._dynamo.config.cache_size_limit
         #     torch.compiler.reset()
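
The `wgp.py` hunks above adjust both `test_class_i2v` and `get_lora_dir` so that `i2v_2_2` is treated as an image2video model everywhere except for lora storage, which follows the text2video models. Below is a minimal standalone sketch of that routing, not part of the patch: the `resolve_lora_dir` helper and the `loras` / `loras_i2v` folder names are illustrative assumptions, and the real `get_lora_dir` also dispatches on the model family and honors the `--lora-dir` command-line override.

```python
# Illustrative sketch of the v7.5 lora folder routing (assumed folder names).
I2V_CLASS = {"i2v", "i2v_2_2", "fun_inp_1.3B", "fun_inp", "flf2v_720p", "fantasy", "multitalk"}

def resolve_lora_dir(base_model_type: str) -> str:
    # i2v_2_2 is classified as image2video for the UI, but it shares the t2v layer
    # layout, so its loras are stored alongside the text2video loras.
    use_i2v_folder = base_model_type in I2V_CLASS and base_model_type != "i2v_2_2"
    return "loras_i2v" if use_i2v_folder else "loras"

assert resolve_lora_dir("i2v") == "loras_i2v"
assert resolve_lora_dir("i2v_2_2") == "loras"  # Wan 2.2 i2v loras live with the t2v loras
```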