From 7bcd7246217954956536515d8b220f349cb5f288 Mon Sep 17 00:00:00 2001
From: DeepBeepMeep
Date: Mon, 15 Sep 2025 19:28:18 +0200
Subject: [PATCH] attack of the clones

---
 README.md                               | 228 +---------
 defaults/qwen_image_edit_20B.json       |   1 +
 defaults/vace_fun_14B_2_2.json          |  24 ++
 defaults/vace_fun_14B_cocktail_2_2.json |  28 ++
 docs/CHANGELOG.md                       | 144 ++++++-
 models/flux/flux_handler.py             |   2 +-
 models/flux/sampling.py                 |   2 +-
 models/hyvideo/hunyuan_handler.py       |  10 +-
 models/ltx_video/ltxv.py                |   4 +-
 models/ltx_video/ltxv_handler.py        |   2 +-
 models/qwen/pipeline_qwenimage.py       |  16 +-
 models/qwen/qwen_handler.py             |  27 +-
 models/qwen/qwen_main.py                |  14 +-
 models/wan/any2video.py                 |  90 ++--
 models/wan/df_handler.py                |   2 +-
 models/wan/multitalk/multitalk.py       |  18 +-
 models/wan/wan_handler.py               |   9 +-
 preprocessing/extract_vocals.py         |  69 +++
 preprocessing/speakers_separator.py     |  12 +-
 requirements.txt                        |   5 +-
 wgp.py                                  | 530 +++++++++++++++---
 21 files changed, 759 insertions(+), 478 deletions(-)
 create mode 100644 defaults/vace_fun_14B_2_2.json
 create mode 100644 defaults/vace_fun_14B_cocktail_2_2.json
 create mode 100644 preprocessing/extract_vocals.py

diff --git a/README.md b/README.md
index d33b6dc..2411cea 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,19 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTV Video models
 **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
 ## 🔥 Latest Updates :
+### September 15 2025: WanGP v8.6 - Attack of the Clones
+
+- The long awaited **Vace for Wan 2.2** is at last here, or maybe not: it has been released by the *Fun Team* of *Alibaba* and it is not the official one. You can play with the vanilla version (**Vace Fun**) or with the one accelerated with Loras (**Vace Fun Cocktail**).
+
+- **First Frame / Last Frame for Vace**: Vace models are so powerful that they could do *First frame / Last frame* since day one using the *Injected Frames* feature. However, this required computing by hand the location of each end frame, since this feature expects frame positions. I have made it easier to compute these locations with the "L" alias:
+
+For a video Gen from scratch, *"1 L L L"* means the 4 Injected Frames will be injected like this: frame no 1 at the first position, the next frame at the end of the first window, the following frame at the end of the next window, and so on (see the sketch below).
+If you *Continue a Video*, you just need *"L L L"*, since the first frame is the last frame of the *Source Video*. In any case, remember that numeric frame positions (like "1") are aligned by default to the beginning of the source window, so low values such as 1 will be considered in the past unless you change this behaviour in *Sliding Window Tab / Control Video, Injected Frames alignment*.
+
+- **Qwen Inpainting** now exists in two versions: the original version from the previous release and a Lora based version. Each version has its pros and cons. For instance, the Lora version also supports **Outpainting**! However, it tends to slightly change the original image even outside the outpainted area.
+
+- **Better Lipsync with all the Audio to Video models**: you probably noticed that *Multitalk*, *InfiniteTalk* or *Hunyuan Avatar* had so-so lipsync when the provided audio contained background music. The problem should be solved now thanks to automated background music removal, done entirely by AI. Don't worry, you will still hear the music, as it is added back into the generated Video.
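+Below is a minimal, illustrative Python sketch of how an Injected Frames spec using the "L" alias could be expanded into absolute frame positions. It only mirrors the behaviour described above: the function name, the 81-frame window length and the overlap handling are assumptions made for the example, not WanGP's actual implementation in `wgp.py`.
+
+```python
+def expand_injected_positions(spec: str, window_size: int = 81, overlap: int = 0) -> list[int]:
+    """Expand a spec such as "1 L L L" into absolute (1-based) frame positions.
+
+    Numeric tokens are taken as absolute positions; each "L" resolves to the
+    last frame of the next sliding window (windows advance by window_size - overlap).
+    """
+    positions = []
+    window_end = window_size  # last frame of the first window
+    for token in spec.split():
+        if token.upper() == "L":
+            positions.append(window_end)
+            window_end += window_size - overlap
+        else:
+            positions.append(int(token))
+    return positions
+
+# Generation from scratch: "1 L L L" -> frame 1, then one frame at the end of each window
+print(expand_injected_positions("1 L L L"))  # [1, 81, 162, 243]
+# Continuing a video: "L L L" only, since the first frame comes from the Source Video
+print(expand_injected_positions("L L L"))    # [81, 162, 243]
+```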
+ ### September 11 2025: WanGP v8.5/8.55 - Wanna be a Cropper or a Painter ? I have done some intensive internal refactoring of the generation pipeline to ease support of existing models or add new models. Nothing really visible but this makes WanGP is little more future proof. @@ -74,221 +87,6 @@ You will find below a 33s movie I have created using these two methods. Quality *update 8.31: one shouldnt talk about bugs if one doesn't want to attract bugs* -### August 29 2025: WanGP v8.21 - Here Goes Your Weekend - -- **InfiniteTalk Video to Video**: this feature can be used for Video Dubbing. Keep in mind that it is a *Sparse Video to Video*, that is internally only image is used by Sliding Window. However thanks to the new *Smooth Transition* mode, each new clip is connected to the previous and all the camera work is done by InfiniteTalk. If you dont get any transition, increase the number of frames of a Sliding Window (81 frames recommended) - -- **StandIn**: very light model specialized in Identity Transfer. I have provided two versions of Standin: a basic one derived from the text 2 video model and another based on Vace. If used with Vace, the last reference frame given to Vace will be also used for StandIn - -- **Flux ESO**: a new Flux dervied *Image Editing tool*, but this one is specialized both in *Identity Transfer* and *Style Transfer*. Style has to be understood in its wide meaning: give a reference picture of a person and another one of Sushis and you will turn this person into Sushis - -### August 24 2025: WanGP v8.1 - the RAM Liberator - -- **Reserved RAM entirely freed when switching models**, you should get much less out of memory related to RAM. I have also added a button in *Configuration / Performance* that will release most of the RAM used by WanGP if you want to use another application without quitting WanGP -- **InfiniteTalk** support: improved version of Multitalk that supposedly supports very long video generations based on an audio track. Exists in two flavors (*Single Speaker* and *Multi Speakers*) but doesnt seem to be compatible with Vace. One key new feature compared to Multitalk is that you can have different visual shots associated to the same audio: each Reference frame you provide you will be associated to a new Sliding Window. If only Reference frame is provided, it will be used for all windows. When Continuing a video, you can either continue the current shot (no Reference Frame) or add new shots (one or more Reference Frames).\ -If you are not into audio, you can use still this model to generate infinite long image2video, just select "no speaker". Last but not least, Infinitetalk works works with all the Loras accelerators. -- **Flux Chroma 1 HD** support: uncensored flux based model and lighter than Flux (8.9B versus 12B) and can fit entirely in VRAM with only 16 GB of VRAM. Unfortunalely it is not distilled and you will need CFG at minimum 20 steps - -### August 21 2025: WanGP v8.01 - the killer of seven - -- **Qwen Image Edit** : Flux Kontext challenger (prompt driven image edition). Best results (including Identity preservation) will be obtained at 720p. Beyond you may get image outpainting and / or lose identity preservation. Below 720p prompt adherence will be worse. Qwen Image Edit works with Qwen Lora Lightning 4 steps. I have also unlocked all the resolutions for Qwen models. Bonus Zone: support for multiple image compositions but identity preservation won't be as good. 
-- **On demand Prompt Enhancer** (needs to be enabled in Configuration Tab) that you can use to Enhance a Text Prompt before starting a Generation. You can refine the Enhanced Prompt or change the original Prompt. -- Choice of a **Non censored Prompt Enhancer**. Beware this is one is VRAM hungry and will require 12 GB of VRAM to work -- **Memory Profile customizable per model** : useful to set for instance Profile 3 (preload the model entirely in VRAM) with only Image Generation models, if you have 24 GB of VRAM. In that case Generation will be much faster because with Image generators (contrary to Video generators) as a lot of time is wasted in offloading -- **Expert Guidance Mode**: change the Guidance during the generation up to 2 times. Very useful with Wan 2.2 Ligthning to reduce the slow motion effect. The idea is to insert a CFG phase before the 2 accelerated phases that follow and have no Guidance. I have added the finetune *Wan2.2 Vace Lightning 3 Phases 14B* with a prebuilt configuration. Please note that it is a 8 steps process although the lora lightning is 4 steps. This expert guidance mode is also available with Wan 2.1. - -*WanGP 8.01 update, improved Qwen Image Edit Identity Preservation* -### August 12 2025: WanGP v7.7777 - Lucky Day(s) - -This is your lucky day ! thanks to new configuration options that will let you store generated Videos and Images in lossless compressed formats, you will find they in fact they look two times better without doing anything ! - -Just kidding, they will be only marginally better, but at least this opens the way to professionnal editing. - -Support: -- Video: x264, x264 lossless, x265 -- Images: jpeg, png, webp, wbp lossless -Generation Settings are stored in each of the above regardless of the format (that was the hard part). - -Also you can now choose different output directories for images and videos. - -unexpected luck: fixed lightning 8 steps for Qwen, and lightning 4 steps for Wan 2.2, now you just need 1x multiplier no weird numbers. -*update 7.777 : oops got a crash a with FastWan ? Luck comes and goes, try a new update, maybe you will have a better chance this time* -*update 7.7777 : Sometime good luck seems to last forever. For instance what if Qwen Lightning 4 steps could also work with WanGP ?* -- https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-4steps-V1.0-bf16.safetensors (Qwen Lightning 4 steps) -- https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-8steps-V1.1-bf16.safetensors (new improved version of Qwen Lightning 8 steps) - - -### August 10 2025: WanGP v7.76 - Faster than the VAE ... -We have a funny one here today: FastWan 2.2 5B, the Fastest Video Generator, only 20s to generate 121 frames at 720p. The snag is that VAE is twice as slow... -Thanks to Kijai for extracting the Lora that is used to build the corresponding finetune. - -*WanGP 7.76: fixed the messed up I did to i2v models (loras path was wrong for Wan2.2 and Clip broken)* - -### August 9 2025: WanGP v7.74 - Qwen Rebirth part 2 -Added support for Qwen Lightning lora for a 8 steps generation (https://huggingface.co/lightx2v/Qwen-Image-Lightning/blob/main/Qwen-Image-Lightning-8steps-V1.0.safetensors). Lora is not normalized and you can use a multiplier around 0.1. - -Mag Cache support for all the Wan2.2 models Don't forget to set guidance to 1 and 8 denoising steps , your gen will be 7x faster ! 
- -### August 8 2025: WanGP v7.73 - Qwen Rebirth -Ever wondered what impact not using Guidance has on a model that expects it ? Just look at Qween Image in WanGP 7.71 whose outputs were erratic. Somehow I had convinced myself that Qwen was a distilled model. In fact Qwen was dying for a negative prompt. And in WanGP 7.72 there is at last one for him. - -As Qwen is not so picky after all I have added also quantized text encoder which reduces the RAM requirements of Qwen by 10 GB (the text encoder quantized version produced garbage before) - -Unfortunately still the Sage bug for older GPU architectures. Added Sdpa fallback for these architectures. - -*7.73 update: still Sage / Sage2 bug for GPUs before RTX40xx. I have added a detection mechanism that forces Sdpa attention if that's the case* - - -### August 6 2025: WanGP v7.71 - Picky, picky - -This release comes with two new models : -- Qwen Image: a Commercial grade Image generator capable to inject full sentences in the generated Image while still offering incredible visuals -- Wan 2.2 TextImage to Video 5B: the last Wan 2.2 needed if you want to complete your Wan 2.2 collection (loras for this folder can be stored in "\loras\5B" ) - -There is catch though, they are very picky if you want to get good generations: first they both need lots of steps (50 ?) to show what they have to offer. Then for Qwen Image I had to hardcode the supported resolutions, because if you try anything else, you will get garbage. Likewise Wan 2.2 5B will remind you of Wan 1.0 if you don't ask for at least 720p. - -*7.71 update: Added VAE Tiling for both Qwen Image and Wan 2.2 TextImage to Video 5B, for low VRAM during a whole gen.* - - -### August 4 2025: WanGP v7.6 - Remuxed - -With this new version you won't have any excuse if there is no sound in your video. - -*Continue Video* now works with any video that has already some sound (hint: Multitalk ). - -Also, on top of MMaudio and the various sound driven models I have added the ability to use your own soundtrack. - -As a result you can apply a different sound source on each new video segment when doing a *Continue Video*. - -For instance: -- first video part: use Multitalk with two people speaking -- second video part: you apply your own soundtrack which will gently follow the multitalk conversation -- third video part: you use Vace effect and its corresponding control audio will be concatenated to the rest of the audio - -To multiply the combinations I have also implemented *Continue Video* with the various image2video models. - -Also: -- End Frame support added for LTX Video models -- Loras can now be targetted specifically at the High noise or Low noise models with Wan 2.2, check the Loras and Finetune guides -- Flux Krea Dev support - -### July 30 2025: WanGP v7.5: Just another release ... Wan 2.2 part 2 -Here is now Wan 2.2 image2video a very good model if you want to set Start and End frames. Two Wan 2.2 models delivered, only one to go ... - -Please note that although it is an image2video model it is structurally very close to Wan 2.2 text2video (same layers with only a different initial projection). Given that Wan 2.1 image2video loras don't work too well (half of their tensors are not supported), I have decided that this model will look for its loras in the text2video loras folder instead of the image2video folder. - -I have also optimized RAM management with Wan 2.2 so that loras and modules will be loaded only once in RAM and Reserved RAM, this saves up to 5 GB of RAM which can make a difference... 
- -And this time I really removed Vace Cocktail Light which gave a blurry vision. - -### July 29 2025: WanGP v7.4: Just another release ... Wan 2.2 Preview -Wan 2.2 is here. The good news is that WanGP wont require a single byte of extra VRAM to run it and it will be as fast as Wan 2.1. The bad news is that you will need much more RAM if you want to leverage entirely this new model since it has twice has many parameters. - -So here is a preview version of Wan 2.2 that is without the 5B model and Wan 2.2 image to video for the moment. - -However as I felt bad to deliver only half of the wares, I gave you instead .....** Wan 2.2 Vace Experimental Cocktail** ! - -Very good surprise indeed, the loras and Vace partially work with Wan 2.2. We will need to wait for the official Vace 2.2 release since some Vace features are broken like identity preservation - -Bonus zone: Flux multi images conditions has been added, or maybe not if I broke everything as I have been distracted by Wan... - -7.4 update: I forgot to update the version number. I also removed Vace Cocktail light which didnt work well. - -### July 27 2025: WanGP v7.3 : Interlude -While waiting for Wan 2.2, you will appreciate the model selection hierarchy which is very useful to collect even more models. You will also appreciate that WanGP remembers which model you used last in each model family. - -### July 26 2025: WanGP v7.2 : Ode to Vace -I am really convinced that Vace can do everything the other models can do and in a better way especially as Vace can be combined with Multitalk. - -Here are some new Vace improvements: -- I have provided a default finetune named *Vace Cocktail* which is a model created on the fly using the Wan text 2 video model and the Loras used to build FusioniX. The weight of the *Detail Enhancer* Lora has been reduced to improve identity preservation. Copy the model definition in *defaults/vace_14B_cocktail.json* in the *finetunes/* folder to change the Cocktail composition. Cocktail contains already some Loras acccelerators so no need to add on top a Lora Accvid, Causvid or Fusionix, ... . The whole point of Cocktail is to be able to build you own FusioniX (which originally is a combination of 4 loras) but without the inconvenient of FusioniX. -- Talking about identity preservation, it tends to go away when one generates a single Frame instead of a Video which is shame for our Vace photoshop. But there is a solution : I have added an Advanced Quality option, that tells WanGP to generate a little more than a frame (it will still keep only the first frame). It will be a little slower but you will be amazed how Vace Cocktail combined with this option will preserve identities (bye bye *Phantom*). -- As in practise I have observed one switches frequently between *Vace text2video* and *Vace text2image* I have put them in the same place they are now just one tab away, no need to reload the model. Likewise *Wan text2video* and *Wan tex2image* have been merged. -- Color fixing when using Sliding Windows. A new postprocessing *Color Correction* applied automatically by default (you can disable it in the *Advanced tab Sliding Window*) will try to match the colors of the new window with that of the previous window. It doesnt fix all the unwanted artifacts of the new window but at least this makes the transition smoother. Thanks to the multitalk team for the original code. - -Also you will enjoy our new real time statistics (CPU / GPU usage, RAM / VRAM used, ... ). 
Many thanks to **Redtash1** for providing the framework for this new feature ! You need to go in the Config tab to enable real time stats. - - -### July 21 2025: WanGP v7.12 -- Flux Family Reunion : *Flux Dev* and *Flux Schnell* have been invited aboard WanGP. To celebrate that, Loras support for the Flux *diffusers* format has also been added. - -- LTX Video upgraded to version 0.9.8: you can now generate 1800 frames (1 min of video !) in one go without a sliding window. With the distilled model it will take only 5 minutes with a RTX 4090 (you will need 22 GB of VRAM though). I have added options to select higher humber frames if you want to experiment (go to Configuration Tab / General / Increase the Max Number of Frames, change the value and restart the App) - -- LTX Video ControlNet : it is a Control Net that allows you for instance to transfer a Human motion or Depth from a control video. It is not as powerful as Vace but can produce interesting things especially as now you can generate quickly a 1 min video. Under the scene IC-Loras (see below) for Pose, Depth and Canny are automatically loaded for you, no need to add them. - -- LTX IC-Lora support: these are special Loras that consumes a conditional image or video -Beside the pose, depth and canny IC-Loras transparently loaded there is the *detailer* (https://huggingface.co/Lightricks/LTX-Video-ICLoRA-detailer-13b-0.9.8) which is basically an upsampler. Add the *detailer* as a Lora and use LTX Raw Format as control net choice to use it. - -- Matanyone is now also for the GPU Poor as its VRAM requirements have been divided by 2! (7.12 shadow update) - -- Easier way to select video resolution - - -### July 15 2025: WanGP v7.0 is an AI Powered Photoshop -This release turns the Wan models into Image Generators. This goes way more than allowing to generate a video made of single frame : -- Multiple Images generated at the same time so that you can choose the one you like best.It is Highly VRAM optimized so that you can generate for instance 4 720p Images at the same time with less than 10 GB -- With the *image2image* the original text2video WanGP becomes an image upsampler / restorer -- *Vace image2image* comes out of the box with image outpainting, person / object replacement, ... -- You can use in one click a newly Image generated as Start Image or Reference Image for a Video generation - -And to complete the full suite of AI Image Generators, Ladies and Gentlemen please welcome for the first time in WanGP : **Flux Kontext**.\ -As a reminder Flux Kontext is an image editor : give it an image and a prompt and it will do the change for you.\ -This highly optimized version of Flux Kontext will make you feel that you have been cheated all this time as WanGP Flux Kontext requires only 8 GB of VRAM to generate 4 images at the same time with no need for quantization. - -WanGP v7 comes with *Image2image* vanilla and *Vace FusinoniX*. However you can build your own finetune where you will combine a text2video or Vace model with any combination of Loras. - -Also in the news: -- You can now enter the *Bbox* for each speaker in *Multitalk* to precisely locate who is speaking. And to save some headaches the *Image Mask generator* will give you the *Bbox* coordinates of an area you have selected. 
-- *Film Grain* post processing to add a vintage look at your video -- *First Last Frame to Video* model should work much better now as I have discovered rencently its implementation was not complete -- More power for the finetuners, you can now embed Loras directly in the finetune definition. You can also override the default models (titles, visibility, ...) with your own finetunes. Check the doc that has been updated. - - -### July 10 2025: WanGP v6.7, is NAG a game changer ? you tell me -Maybe you knew that already but most *Loras accelerators* we use today (Causvid, FusioniX) don't use *Guidance* at all (that it is *CFG* is set to 1). This helps to get much faster generations but the downside is that *Negative Prompts* are completely ignored (including the default ones set by the models). **NAG** (https://github.com/ChenDarYen/Normalized-Attention-Guidance) aims to solve that by injecting the *Negative Prompt* during the *attention* processing phase. - -So WanGP 6.7 gives you NAG, but not any NAG, a *Low VRAM* implementation, the default one ends being VRAM greedy. You will find NAG in the *General* advanced tab for most Wan models. - -Use NAG especially when Guidance is set to 1. To turn it on set the **NAG scale** to something around 10. There are other NAG parameters **NAG tau** and **NAG alpha** which I recommend to change only if you don't get good results by just playing with the NAG scale. Don't hesitate to share on this discord server the best combinations for these 3 parameters. - -The authors of NAG claim that NAG can also be used when using a Guidance (CFG > 1) and to improve the prompt adherence. - -### July 8 2025: WanGP v6.6, WanGP offers you **Vace Multitalk Dual Voices Fusionix Infinite** : -**Vace** our beloved super Control Net has been combined with **Multitalk** the new king in town that can animate up to two people speaking (**Dual Voices**). It is accelerated by the **Fusionix** model and thanks to *Sliding Windows* support and *Adaptive Projected Guidance* (much slower but should reduce the reddish effect with long videos) your two people will be able to talk for very a long time (which is an **Infinite** amount of time in the field of video generation). - -Of course you will get as well *Multitalk* vanilla and also *Multitalk 720p* as a bonus. - -And since I am mister nice guy I have enclosed as an exclusivity an *Audio Separator* that will save you time to isolate each voice when using Multitalk with two people. - -As I feel like resting a bit I haven't produced yet a nice sample Video to illustrate all these new capabilities. But here is the thing, I ams sure you will publish in the *Share Your Best Video* channel your *Master Pieces*. The best ones will be added to the *Announcements Channel* and will bring eternal fame to its authors. - -But wait, there is more: -- Sliding Windows support has been added anywhere with Wan models, so imagine with text2video recently upgraded in 6.5 into a video2video, you can now upsample very long videos regardless of your VRAM. The good old image2video model can now reuse the last image to produce new videos (as requested by many of you) -- I have added also the capability to transfer the audio of the original control video (Misc. advanced tab) and an option to preserve the fps into the generated video, so from now on you will be to upsample / restore your old families video and keep the audio at their original pace. 
Be aware that the duration will be limited to 1000 frames as I still need to add streaming support for unlimited video sizes. - -Also, of interest too: -- Extract video info from Videos that have not been generated by WanGP, even better you can also apply post processing (Upsampling / MMAudio) on non WanGP videos -- Force the generated video fps to your liking, works wery well with Vace when using a Control Video -- Ability to chain URLs of Finetune models (for instance put the URLs of a model in your main finetune and reference this finetune in other finetune models to save time) - -### July 2 2025: WanGP v6.5.1, WanGP takes care of you: lots of quality of life features: -- View directly inside WanGP the properties (seed, resolutions, length, most settings...) of the past generations -- In one click use the newly generated video as a Control Video or Source Video to be continued -- Manage multiple settings for the same model and switch between them using a dropdown box -- WanGP will keep the last generated videos in the Gallery and will remember the last model you used if you restart the app but kept the Web page open -- Custom resolutions : add a file in the WanGP folder with the list of resolutions you want to see in WanGP (look at the instruction readme in this folder) - -Taking care of your life is not enough, you want new stuff to play with ? -- MMAudio directly inside WanGP : add an audio soundtrack that matches the content of your video. By the way it is a low VRAM MMAudio and 6 GB of VRAM should be sufficient. You will need to go in the *Extensions* tab of the WanGP *Configuration* to enable MMAudio -- Forgot to upsample your video during the generation ? want to try another MMAudio variation ? Fear not you can also apply upsampling or add an MMAudio track once the video generation is done. Even better you can ask WangGP for multiple variations of MMAudio to pick the one you like best -- MagCache support: a new step skipping approach, supposed to be better than TeaCache. Makes a difference if you usually generate with a high number of steps -- SageAttention2++ support : not just the compatibility but also a slightly reduced VRAM usage -- Video2Video in Wan Text2Video : this is the paradox, a text2video can become a video2video if you start the denoising process later on an existing video -- FusioniX upsampler: this is an illustration of Video2Video in Text2Video. Use the FusioniX text2video model with an output resolution of 1080p and a denoising strength of 0.25 and you will get one of the best upsamplers (in only 2/3 steps, you will need lots of VRAM though). 
Increase the denoising strength and you will get one of the best Video Restorer -- Choice of Wan Samplers / Schedulers -- More Lora formats support - -**If you had upgraded to v6.5 please upgrade again to 6.5.1 as this will fix a bug that ignored Loras beyond the first one** See full changelog: **[Changelog](docs/CHANGELOG.md)** diff --git a/defaults/qwen_image_edit_20B.json b/defaults/qwen_image_edit_20B.json index 79b8b24..04fc573 100644 --- a/defaults/qwen_image_edit_20B.json +++ b/defaults/qwen_image_edit_20B.json @@ -7,6 +7,7 @@ "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_20B_bf16.safetensors", "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_20B_quanto_bf16_int8.safetensors" ], + "preload_URLs": ["https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_inpainting.safetensors"], "attention": { "<89": "sdpa" } diff --git a/defaults/vace_fun_14B_2_2.json b/defaults/vace_fun_14B_2_2.json new file mode 100644 index 0000000..8f22d34 --- /dev/null +++ b/defaults/vace_fun_14B_2_2.json @@ -0,0 +1,24 @@ +{ + "model": { + "name": "Wan2.2 Vace Fun 14B", + "architecture": "vace_14B", + "description": "This is the Fun Vace 2.2 version, that is not the official Vace 2.2", + "URLs": [ + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_HIGH_mbf16.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_HIGH_quanto_mbf16_int8.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_HIGH_quanto_mfp16_int8.safetensors" + ], + "URLs2": [ + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_LOW_mbf16.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_LOW_quanto_mbf16_int8.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_LOW_quanto_mfp16_int8.safetensors" + ], + "group": "wan2_2" + }, + "guidance_phases": 2, + "num_inference_steps": 30, + "guidance_scale": 1, + "guidance2_scale": 1, + "flow_shift": 2, + "switch_threshold": 875 +} \ No newline at end of file diff --git a/defaults/vace_fun_14B_cocktail_2_2.json b/defaults/vace_fun_14B_cocktail_2_2.json new file mode 100644 index 0000000..c587abd --- /dev/null +++ b/defaults/vace_fun_14B_cocktail_2_2.json @@ -0,0 +1,28 @@ +{ + "model": { + "name": "Wan2.2 Vace Fun Cocktail 14B", + "architecture": "vace_14B", + "description": "This model has been created on the fly using the Wan text 2.2 video model and the Loras of FusioniX. The weight of the Detail Enhancer Lora has been reduced to improve identity preservation. 
This is the Fun Vace 2.2, that is not the official Vace 2.2", + "URLs": "vace_fun_14B_2_2", + "URLs2": "vace_fun_14B_2_2", + "loras": [ + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_CausVid_14B_T2V_lora_rank32_v2.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/DetailEnhancerV1.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_AccVid_T2V_14B_lora_rank32_fp16.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_T2V_14B_MoviiGen_lora_rank32_fp16.safetensors" + ], + "loras_multipliers": [ + 1, + 0.2, + 0.5, + 0.5 + ], + "group": "wan2_2" + }, + "guidance_phases": 2, + "num_inference_steps": 10, + "guidance_scale": 1, + "guidance2_scale": 1, + "flow_shift": 2, + "switch_threshold": 875 +} \ No newline at end of file diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 5a89d93..b0eeae3 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,20 +1,154 @@ # Changelog ## 🔥 Latest News -### July 21 2025: WanGP v7.1 +### August 29 2025: WanGP v8.21 - Here Goes Your Weekend + +- **InfiniteTalk Video to Video**: this feature can be used for Video Dubbing. Keep in mind that it is a *Sparse Video to Video*, that is internally only image is used by Sliding Window. However thanks to the new *Smooth Transition* mode, each new clip is connected to the previous and all the camera work is done by InfiniteTalk. If you dont get any transition, increase the number of frames of a Sliding Window (81 frames recommended) + +- **StandIn**: very light model specialized in Identity Transfer. I have provided two versions of Standin: a basic one derived from the text 2 video model and another based on Vace. If used with Vace, the last reference frame given to Vace will be also used for StandIn + +- **Flux ESO**: a new Flux dervied *Image Editing tool*, but this one is specialized both in *Identity Transfer* and *Style Transfer*. Style has to be understood in its wide meaning: give a reference picture of a person and another one of Sushis and you will turn this person into Sushis + +### August 24 2025: WanGP v8.1 - the RAM Liberator + +- **Reserved RAM entirely freed when switching models**, you should get much less out of memory related to RAM. I have also added a button in *Configuration / Performance* that will release most of the RAM used by WanGP if you want to use another application without quitting WanGP +- **InfiniteTalk** support: improved version of Multitalk that supposedly supports very long video generations based on an audio track. Exists in two flavors (*Single Speaker* and *Multi Speakers*) but doesnt seem to be compatible with Vace. One key new feature compared to Multitalk is that you can have different visual shots associated to the same audio: each Reference frame you provide you will be associated to a new Sliding Window. If only Reference frame is provided, it will be used for all windows. When Continuing a video, you can either continue the current shot (no Reference Frame) or add new shots (one or more Reference Frames).\ +If you are not into audio, you can use still this model to generate infinite long image2video, just select "no speaker". Last but not least, Infinitetalk works works with all the Loras accelerators. +- **Flux Chroma 1 HD** support: uncensored flux based model and lighter than Flux (8.9B versus 12B) and can fit entirely in VRAM with only 16 GB of VRAM. 
Unfortunalely it is not distilled and you will need CFG at minimum 20 steps + +### August 21 2025: WanGP v8.01 - the killer of seven + +- **Qwen Image Edit** : Flux Kontext challenger (prompt driven image edition). Best results (including Identity preservation) will be obtained at 720p. Beyond you may get image outpainting and / or lose identity preservation. Below 720p prompt adherence will be worse. Qwen Image Edit works with Qwen Lora Lightning 4 steps. I have also unlocked all the resolutions for Qwen models. Bonus Zone: support for multiple image compositions but identity preservation won't be as good. +- **On demand Prompt Enhancer** (needs to be enabled in Configuration Tab) that you can use to Enhance a Text Prompt before starting a Generation. You can refine the Enhanced Prompt or change the original Prompt. +- Choice of a **Non censored Prompt Enhancer**. Beware this is one is VRAM hungry and will require 12 GB of VRAM to work +- **Memory Profile customizable per model** : useful to set for instance Profile 3 (preload the model entirely in VRAM) with only Image Generation models, if you have 24 GB of VRAM. In that case Generation will be much faster because with Image generators (contrary to Video generators) as a lot of time is wasted in offloading +- **Expert Guidance Mode**: change the Guidance during the generation up to 2 times. Very useful with Wan 2.2 Ligthning to reduce the slow motion effect. The idea is to insert a CFG phase before the 2 accelerated phases that follow and have no Guidance. I have added the finetune *Wan2.2 Vace Lightning 3 Phases 14B* with a prebuilt configuration. Please note that it is a 8 steps process although the lora lightning is 4 steps. This expert guidance mode is also available with Wan 2.1. + +*WanGP 8.01 update, improved Qwen Image Edit Identity Preservation* +### August 12 2025: WanGP v7.7777 - Lucky Day(s) + +This is your lucky day ! thanks to new configuration options that will let you store generated Videos and Images in lossless compressed formats, you will find they in fact they look two times better without doing anything ! + +Just kidding, they will be only marginally better, but at least this opens the way to professionnal editing. + +Support: +- Video: x264, x264 lossless, x265 +- Images: jpeg, png, webp, wbp lossless +Generation Settings are stored in each of the above regardless of the format (that was the hard part). + +Also you can now choose different output directories for images and videos. + +unexpected luck: fixed lightning 8 steps for Qwen, and lightning 4 steps for Wan 2.2, now you just need 1x multiplier no weird numbers. +*update 7.777 : oops got a crash a with FastWan ? Luck comes and goes, try a new update, maybe you will have a better chance this time* +*update 7.7777 : Sometime good luck seems to last forever. For instance what if Qwen Lightning 4 steps could also work with WanGP ?* +- https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-4steps-V1.0-bf16.safetensors (Qwen Lightning 4 steps) +- https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-8steps-V1.1-bf16.safetensors (new improved version of Qwen Lightning 8 steps) + + +### August 10 2025: WanGP v7.76 - Faster than the VAE ... +We have a funny one here today: FastWan 2.2 5B, the Fastest Video Generator, only 20s to generate 121 frames at 720p. The snag is that VAE is twice as slow... +Thanks to Kijai for extracting the Lora that is used to build the corresponding finetune. 
+ +*WanGP 7.76: fixed the messed up I did to i2v models (loras path was wrong for Wan2.2 and Clip broken)* + +### August 9 2025: WanGP v7.74 - Qwen Rebirth part 2 +Added support for Qwen Lightning lora for a 8 steps generation (https://huggingface.co/lightx2v/Qwen-Image-Lightning/blob/main/Qwen-Image-Lightning-8steps-V1.0.safetensors). Lora is not normalized and you can use a multiplier around 0.1. + +Mag Cache support for all the Wan2.2 models Don't forget to set guidance to 1 and 8 denoising steps , your gen will be 7x faster ! + +### August 8 2025: WanGP v7.73 - Qwen Rebirth +Ever wondered what impact not using Guidance has on a model that expects it ? Just look at Qween Image in WanGP 7.71 whose outputs were erratic. Somehow I had convinced myself that Qwen was a distilled model. In fact Qwen was dying for a negative prompt. And in WanGP 7.72 there is at last one for him. + +As Qwen is not so picky after all I have added also quantized text encoder which reduces the RAM requirements of Qwen by 10 GB (the text encoder quantized version produced garbage before) + +Unfortunately still the Sage bug for older GPU architectures. Added Sdpa fallback for these architectures. + +*7.73 update: still Sage / Sage2 bug for GPUs before RTX40xx. I have added a detection mechanism that forces Sdpa attention if that's the case* + + +### August 6 2025: WanGP v7.71 - Picky, picky + +This release comes with two new models : +- Qwen Image: a Commercial grade Image generator capable to inject full sentences in the generated Image while still offering incredible visuals +- Wan 2.2 TextImage to Video 5B: the last Wan 2.2 needed if you want to complete your Wan 2.2 collection (loras for this folder can be stored in "\loras\5B" ) + +There is catch though, they are very picky if you want to get good generations: first they both need lots of steps (50 ?) to show what they have to offer. Then for Qwen Image I had to hardcode the supported resolutions, because if you try anything else, you will get garbage. Likewise Wan 2.2 5B will remind you of Wan 1.0 if you don't ask for at least 720p. + +*7.71 update: Added VAE Tiling for both Qwen Image and Wan 2.2 TextImage to Video 5B, for low VRAM during a whole gen.* + + +### August 4 2025: WanGP v7.6 - Remuxed + +With this new version you won't have any excuse if there is no sound in your video. + +*Continue Video* now works with any video that has already some sound (hint: Multitalk ). + +Also, on top of MMaudio and the various sound driven models I have added the ability to use your own soundtrack. + +As a result you can apply a different sound source on each new video segment when doing a *Continue Video*. + +For instance: +- first video part: use Multitalk with two people speaking +- second video part: you apply your own soundtrack which will gently follow the multitalk conversation +- third video part: you use Vace effect and its corresponding control audio will be concatenated to the rest of the audio + +To multiply the combinations I have also implemented *Continue Video* with the various image2video models. + +Also: +- End Frame support added for LTX Video models +- Loras can now be targetted specifically at the High noise or Low noise models with Wan 2.2, check the Loras and Finetune guides +- Flux Krea Dev support + +### July 30 2025: WanGP v7.5: Just another release ... Wan 2.2 part 2 +Here is now Wan 2.2 image2video a very good model if you want to set Start and End frames. Two Wan 2.2 models delivered, only one to go ... 
+ +Please note that although it is an image2video model it is structurally very close to Wan 2.2 text2video (same layers with only a different initial projection). Given that Wan 2.1 image2video loras don't work too well (half of their tensors are not supported), I have decided that this model will look for its loras in the text2video loras folder instead of the image2video folder. + +I have also optimized RAM management with Wan 2.2 so that loras and modules will be loaded only once in RAM and Reserved RAM, this saves up to 5 GB of RAM which can make a difference... + +And this time I really removed Vace Cocktail Light which gave a blurry vision. + +### July 29 2025: WanGP v7.4: Just another release ... Wan 2.2 Preview +Wan 2.2 is here. The good news is that WanGP wont require a single byte of extra VRAM to run it and it will be as fast as Wan 2.1. The bad news is that you will need much more RAM if you want to leverage entirely this new model since it has twice has many parameters. + +So here is a preview version of Wan 2.2 that is without the 5B model and Wan 2.2 image to video for the moment. + +However as I felt bad to deliver only half of the wares, I gave you instead .....** Wan 2.2 Vace Experimental Cocktail** ! + +Very good surprise indeed, the loras and Vace partially work with Wan 2.2. We will need to wait for the official Vace 2.2 release since some Vace features are broken like identity preservation + +Bonus zone: Flux multi images conditions has been added, or maybe not if I broke everything as I have been distracted by Wan... + +7.4 update: I forgot to update the version number. I also removed Vace Cocktail light which didnt work well. + +### July 27 2025: WanGP v7.3 : Interlude +While waiting for Wan 2.2, you will appreciate the model selection hierarchy which is very useful to collect even more models. You will also appreciate that WanGP remembers which model you used last in each model family. + +### July 26 2025: WanGP v7.2 : Ode to Vace +I am really convinced that Vace can do everything the other models can do and in a better way especially as Vace can be combined with Multitalk. + +Here are some new Vace improvements: +- I have provided a default finetune named *Vace Cocktail* which is a model created on the fly using the Wan text 2 video model and the Loras used to build FusioniX. The weight of the *Detail Enhancer* Lora has been reduced to improve identity preservation. Copy the model definition in *defaults/vace_14B_cocktail.json* in the *finetunes/* folder to change the Cocktail composition. Cocktail contains already some Loras acccelerators so no need to add on top a Lora Accvid, Causvid or Fusionix, ... . The whole point of Cocktail is to be able to build you own FusioniX (which originally is a combination of 4 loras) but without the inconvenient of FusioniX. +- Talking about identity preservation, it tends to go away when one generates a single Frame instead of a Video which is shame for our Vace photoshop. But there is a solution : I have added an Advanced Quality option, that tells WanGP to generate a little more than a frame (it will still keep only the first frame). It will be a little slower but you will be amazed how Vace Cocktail combined with this option will preserve identities (bye bye *Phantom*). +- As in practise I have observed one switches frequently between *Vace text2video* and *Vace text2image* I have put them in the same place they are now just one tab away, no need to reload the model. 
Likewise *Wan text2video* and *Wan tex2image* have been merged. +- Color fixing when using Sliding Windows. A new postprocessing *Color Correction* applied automatically by default (you can disable it in the *Advanced tab Sliding Window*) will try to match the colors of the new window with that of the previous window. It doesnt fix all the unwanted artifacts of the new window but at least this makes the transition smoother. Thanks to the multitalk team for the original code. + +Also you will enjoy our new real time statistics (CPU / GPU usage, RAM / VRAM used, ... ). Many thanks to **Redtash1** for providing the framework for this new feature ! You need to go in the Config tab to enable real time stats. + + +### July 21 2025: WanGP v7.12 - Flux Family Reunion : *Flux Dev* and *Flux Schnell* have been invited aboard WanGP. To celebrate that, Loras support for the Flux *diffusers* format has also been added. -- LTX Video upgraded to version 0.9.8: you can now generate 1800 frames (1 min of video !) in one go without a sliding window. With the distilled model it will take only 5 minutes with a RTX 4090 (you will need 22 GB of VRAM though). I have added options to select higher humber frames if you want to experiment +- LTX Video upgraded to version 0.9.8: you can now generate 1800 frames (1 min of video !) in one go without a sliding window. With the distilled model it will take only 5 minutes with a RTX 4090 (you will need 22 GB of VRAM though). I have added options to select higher humber frames if you want to experiment (go to Configuration Tab / General / Increase the Max Number of Frames, change the value and restart the App) - LTX Video ControlNet : it is a Control Net that allows you for instance to transfer a Human motion or Depth from a control video. It is not as powerful as Vace but can produce interesting things especially as now you can generate quickly a 1 min video. Under the scene IC-Loras (see below) for Pose, Depth and Canny are automatically loaded for you, no need to add them. - LTX IC-Lora support: these are special Loras that consumes a conditional image or video Beside the pose, depth and canny IC-Loras transparently loaded there is the *detailer* (https://huggingface.co/Lightricks/LTX-Video-ICLoRA-detailer-13b-0.9.8) which is basically an upsampler. Add the *detailer* as a Lora and use LTX Raw Format as control net choice to use it. -And Also: -- easier way to select video resolution -- started to optimize Matanyone to reduce VRAM requirements +- Matanyone is now also for the GPU Poor as its VRAM requirements have been divided by 2! (7.12 shadow update) +- Easier way to select video resolution ### July 15 2025: WanGP v7.0 is an AI Powered Photoshop This release turns the Wan models into Image Generators. 
This goes way more than allowing to generate a video made of single frame : diff --git a/models/flux/flux_handler.py b/models/flux/flux_handler.py index 808369f..d168881 100644 --- a/models/flux/flux_handler.py +++ b/models/flux/flux_handler.py @@ -107,7 +107,7 @@ class family_handler(): ] @staticmethod - def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized = False): + def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized = False, submodel_no_list = None): from .flux_main import model_factory flux_model = model_factory( diff --git a/models/flux/sampling.py b/models/flux/sampling.py index 1b4813a..92ee590 100644 --- a/models/flux/sampling.py +++ b/models/flux/sampling.py @@ -203,7 +203,7 @@ def prepare_kontext( image_mask_latents = convert_image_to_tensor(img_mask.resize((target_width // 16, target_height // 16), resample=Image.Resampling.LANCZOS)) image_mask_latents = torch.where(image_mask_latents>-0.5, 1., 0. )[0:1] image_mask_rebuilt = image_mask_latents.repeat_interleave(16, dim=-1).repeat_interleave(16, dim=-2).unsqueeze(0) - convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") + # convert_tensor_to_image( image_mask_rebuilt.squeeze(0).repeat(3,1,1)).save("mmm.png") image_mask_latents = image_mask_latents.reshape(1, -1, 1).to(device) return_dict.update({ "img_msk_latents": image_mask_latents, diff --git a/models/hyvideo/hunyuan_handler.py b/models/hyvideo/hunyuan_handler.py index 67e9e99..4352341 100644 --- a/models/hyvideo/hunyuan_handler.py +++ b/models/hyvideo/hunyuan_handler.py @@ -68,7 +68,13 @@ class family_handler(): "visible": False, } - if base_model_type in ["hunyuan_avatar"]: extra_model_def["no_background_removal"] = True + if base_model_type in ["hunyuan_avatar"]: + extra_model_def["image_ref_choices"] = { + "choices": [("Start Image", "KI")], + "letters_filter":"KI", + "visible": False, + } + extra_model_def["no_background_removal"] = True if base_model_type in ["hunyuan_custom", "hunyuan_custom_edit", "hunyuan_custom_audio", "hunyuan_avatar"]: extra_model_def["one_image_ref_needed"] = True @@ -123,7 +129,7 @@ class family_handler(): } @staticmethod - def load_model(model_filename, model_type = None, base_model_type = None, model_def = None, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized = False): + def load_model(model_filename, model_type = None, base_model_type = None, model_def = None, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized = False, submodel_no_list = None): from .hunyuan import HunyuanVideoSampler from mmgp import offload diff --git a/models/ltx_video/ltxv.py b/models/ltx_video/ltxv.py index db143fc..080860c 100644 --- a/models/ltx_video/ltxv.py +++ b/models/ltx_video/ltxv.py @@ -476,14 +476,14 @@ class LTXV: images = images.sub_(0.5).mul_(2).squeeze(0) return images - def get_loras_transformer(self, get_model_recursive_prop, video_prompt_type, **kwargs): + def get_loras_transformer(self, get_model_recursive_prop, model_type, 
video_prompt_type, **kwargs): map = { "P" : "pose", "D" : "depth", "E" : "canny", } loras = [] - preloadURLs = get_model_recursive_prop(self.model_type, "preload_URLs") + preloadURLs = get_model_recursive_prop(model_type, "preload_URLs") lora_file_name = "" for letter, signature in map.items(): if letter in video_prompt_type: diff --git a/models/ltx_video/ltxv_handler.py b/models/ltx_video/ltxv_handler.py index e44c983..2845fdb 100644 --- a/models/ltx_video/ltxv_handler.py +++ b/models/ltx_video/ltxv_handler.py @@ -74,7 +74,7 @@ class family_handler(): @staticmethod - def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized = False): + def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized = False, submodel_no_list = None): from .ltxv import LTXV ltxv_model = LTXV( diff --git a/models/qwen/pipeline_qwenimage.py b/models/qwen/pipeline_qwenimage.py index 0897ee4..be982aa 100644 --- a/models/qwen/pipeline_qwenimage.py +++ b/models/qwen/pipeline_qwenimage.py @@ -569,6 +569,8 @@ class QwenImagePipeline(): #DiffusionPipeline pipeline=None, loras_slists=None, joint_pass= True, + lora_inpaint = False, + outpainting_dims = None, ): r""" Function invoked when calling the pipeline for generation. @@ -704,7 +706,7 @@ class QwenImagePipeline(): #DiffusionPipeline image_height, image_width = calculate_new_dimensions(ref_height, ref_width, image_height, image_width, False, block_size=multiple_of) if (image_width,image_height) != image.size: image = image.resize((image_width,image_height), resample=Image.Resampling.LANCZOS) - else: + elif not lora_inpaint: # _, image_width, image_height = min( # (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS # ) @@ -721,8 +723,16 @@ class QwenImagePipeline(): #DiffusionPipeline if image.size != (image_width, image_height): image = image.resize((image_width, image_height), resample=Image.Resampling.LANCZOS) + image = convert_image_to_tensor(image) + if lora_inpaint: + image_mask_rebuilt = torch.where(convert_image_to_tensor(image_mask)>-0.5, 1., 0. 
)[0:1] + image_mask_latents = None + green = torch.tensor([-1.0, 1.0, -1.0]).to(image) + green_image = green[:, None, None] .expand_as(image) + image = torch.where(image_mask_rebuilt > 0, green_image, image) + prompt_image = convert_tensor_to_image(image) + image = image.unsqueeze(0).unsqueeze(2) # image.save("nnn.png") - image = convert_image_to_tensor(image).unsqueeze(0).unsqueeze(2) has_neg_prompt = negative_prompt is not None or ( negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None @@ -940,7 +950,7 @@ class QwenImagePipeline(): #DiffusionPipeline ) latents = latents / latents_std + latents_mean output_image = self.vae.decode(latents, return_dict=False)[0][:, :, 0] - if image_mask is not None: + if image_mask is not None and not lora_inpaint : #not (lora_inpaint and outpainting_dims is not None): output_image = image.squeeze(2) * (1 - image_mask_rebuilt) + output_image.to(image) * image_mask_rebuilt diff --git a/models/qwen/qwen_handler.py b/models/qwen/qwen_handler.py index 80a909f..010298e 100644 --- a/models/qwen/qwen_handler.py +++ b/models/qwen/qwen_handler.py @@ -1,4 +1,6 @@ import torch +import gradio as gr + def get_qwen_text_encoder_filename(text_encoder_quantization): text_encoder_filename = "ckpts/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct_bf16.safetensors" @@ -29,6 +31,16 @@ class family_handler(): "letters_filter": "KI", } extra_model_def["background_removal_label"]= "Remove Backgrounds only behind People / Objects except main Subject / Landscape" + extra_model_def["video_guide_outpainting"] = [2] + extra_model_def["model_modes"] = { + "choices": [ + ("Lora Inpainting: Inpainted area completely unrelated to occulted content", 1), + ("Masked Denoising : Inpainted area may reuse some content that has been occulted", 0), + ], + "default": 1, + "label" : "Inpainting Method", + "image_modes" : [2], + } return extra_model_def @@ -58,7 +70,7 @@ class family_handler(): } @staticmethod - def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized = False): + def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized = False, submodel_no_list = None): from .qwen_main import model_factory from mmgp import offload @@ -99,5 +111,18 @@ class family_handler(): ui_defaults.update({ "video_prompt_type": "KI", "denoising_strength" : 1., + "model_mode" : 0, }) + def validate_generative_settings(base_model_type, model_def, inputs): + if base_model_type in ["qwen_image_edit_20B"]: + model_mode = inputs["model_mode"] + denoising_strength= inputs["denoising_strength"] + video_guide_outpainting= inputs["video_guide_outpainting"] + from wgp import get_outpainting_dims + outpainting_dims = get_outpainting_dims(video_guide_outpainting) + + if denoising_strength < 1 and model_mode == 1: + gr.Info("Denoising Strength will be ignored while using Lora Inpainting") + if outpainting_dims is not None and model_mode == 0 : + return "Outpainting is not supported with Masked Denoising " diff --git a/models/qwen/qwen_main.py b/models/qwen/qwen_main.py index e4c19ed..da84e0d 100644 --- a/models/qwen/qwen_main.py +++ b/models/qwen/qwen_main.py @@ -44,7 +44,7 @@ class model_factory(): save_quantized = False, dtype = torch.bfloat16, 
VAE_dtype = torch.float32, - mixed_precision_transformer = False + mixed_precision_transformer = False, ): @@ -117,6 +117,8 @@ class model_factory(): joint_pass = True, sample_solver='default', denoising_strength = 1., + model_mode = 0, + outpainting_dims = None, **bbargs ): # Generate with different aspect ratios @@ -205,8 +207,16 @@ class model_factory(): loras_slists=loras_slists, joint_pass = joint_pass, denoising_strength=denoising_strength, - generator=torch.Generator(device="cuda").manual_seed(seed) + generator=torch.Generator(device="cuda").manual_seed(seed), + lora_inpaint = image_mask is not None and model_mode == 1, + outpainting_dims = outpainting_dims, ) if image is None: return None return image.transpose(0, 1) + def get_loras_transformer(self, get_model_recursive_prop, model_type, model_mode, **kwargs): + if model_mode == 0: return [], [] + preloadURLs = get_model_recursive_prop(model_type, "preload_URLs") + return [os.path.join("ckpts", os.path.basename(preloadURLs[0]))] , [1] + + diff --git a/models/wan/any2video.py b/models/wan/any2video.py index 40ed42f..dde1a65 100644 --- a/models/wan/any2video.py +++ b/models/wan/any2video.py @@ -64,6 +64,7 @@ class WanAny2V: config, checkpoint_dir, model_filename = None, + submodel_no_list = None, model_type = None, model_def = None, base_model_type = None, @@ -126,50 +127,65 @@ class WanAny2V: forcedConfigPath = base_config_file if len(model_filename) > 1 else None # forcedConfigPath = base_config_file = f"configs/flf2v_720p.json" # model_filename[1] = xmodel_filename - + self.model = self.model2 = None source = model_def.get("source", None) + source2 = model_def.get("source2", None) module_source = model_def.get("module_source", None) + module_source2 = model_def.get("module_source2", None) if module_source is not None: - model_filename = [] + model_filename - model_filename[1] = module_source - self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath) - elif source is not None: + self.model = offload.fast_load_transformers_model(model_filename[:1] + [module_source], modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath) + if module_source2 is not None: + self.model2 = offload.fast_load_transformers_model(model_filename[1:2] + [module_source2], modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath) + if source is not None: self.model = offload.fast_load_transformers_model(source, modelClass=WanModel, writable_tensors= False, forcedConfigPath= base_config_file) - elif self.transformer_switch: - shared_modules= {} - self.model = offload.fast_load_transformers_model(model_filename[:1], modules = model_filename[2:], modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath, return_shared_modules= shared_modules) - self.model2 = offload.fast_load_transformers_model(model_filename[1:2], modules = shared_modules, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath) - shared_modules = None - 
else: - self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath) - - # self.model = offload.load_model_data(self.model, xmodel_filename ) - # offload.load_model_data(self.model, "c:/temp/Phantom-Wan-1.3B.pth") + if source2 is not None: + self.model2 = offload.fast_load_transformers_model(source2, modelClass=WanModel, writable_tensors= False, forcedConfigPath= base_config_file) - self.model.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype) - offload.change_dtype(self.model, dtype, True) + if self.model is not None or self.model2 is not None: + from wgp import save_model + from mmgp.safetensors2 import torch_load_file + else: + if self.transformer_switch: + if 0 in submodel_no_list[2:] and 1 in submodel_no_list: + raise Exception("Shared and non shared modules at the same time across multiple models is not supported") + + if 0 in submodel_no_list[2:]: + shared_modules= {} + self.model = offload.fast_load_transformers_model(model_filename[:1], modules = model_filename[2:], modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath, return_shared_modules= shared_modules) + self.model2 = offload.fast_load_transformers_model(model_filename[1:2], modules = shared_modules, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath) + shared_modules = None + else: + modules_for_1 =[ file_name for file_name, submodel_no in zip(model_filename[2:],submodel_no_list[2:] ) if submodel_no ==1 ] + modules_for_2 =[ file_name for file_name, submodel_no in zip(model_filename[2:],submodel_no_list[2:] ) if submodel_no ==2 ] + self.model = offload.fast_load_transformers_model(model_filename[:1], modules = modules_for_1, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath) + self.model2 = offload.fast_load_transformers_model(model_filename[1:2], modules = modules_for_2, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath) + + else: + self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer and not save_quantized, writable_tensors= False, defaultConfigPath=base_config_file , forcedConfigPath= forcedConfigPath) + + + if self.model is not None: + self.model.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype) + offload.change_dtype(self.model, dtype, True) + self.model.eval().requires_grad_(False) if self.model2 is not None: self.model2.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype) offload.change_dtype(self.model2, dtype, True) - - # offload.save_model(self.model, "wan2.1_text2video_1.3B_mbf16.safetensors", do_quantize= False, config_file_path=base_config_file, filter_sd=sd) - # offload.save_model(self.model, "wan2.2_image2video_14B_low_mbf16.safetensors", config_file_path=base_config_file) - # offload.save_model(self.model, "wan2.2_image2video_14B_low_quanto_mbf16_int8.safetensors", do_quantize=True,
config_file_path=base_config_file) - self.model.eval().requires_grad_(False) - if self.model2 is not None: self.model2.eval().requires_grad_(False) + if module_source is not None: - from wgp import save_model - from mmgp.safetensors2 import torch_load_file - filter = list(torch_load_file(module_source)) - save_model(self.model, model_type, dtype, None, is_module=True, filter=filter) - elif not source is None: - from wgp import save_model - save_model(self.model, model_type, dtype, None) + save_model(self.model, model_type, dtype, None, is_module=True, filter=list(torch_load_file(module_source)), module_source_no=1) + if module_source2 is not None: + save_model(self.model2, model_type, dtype, None, is_module=True, filter=list(torch_load_file(module_source2)), module_source_no=2) + if not source is None: + save_model(self.model, model_type, dtype, None, submodel_no= 1) + if not source2 is None: + save_model(self.model2, model_type, dtype, None, submodel_no= 2) if save_quantized: from wgp import save_quantized_model - save_quantized_model(self.model, model_type, model_filename[0], dtype, base_config_file) + if self.model is not None: + save_quantized_model(self.model, model_type, model_filename[0], dtype, base_config_file) if self.model2 is not None: save_quantized_model(self.model2, model_type, model_filename[1], dtype, base_config_file, submodel_no=2) self.sample_neg_prompt = config.sample_neg_prompt @@ -307,7 +323,7 @@ class WanAny2V: canvas = canvas.to(device) return ref_img.to(device), canvas - def prepare_source(self, src_video, src_mask, src_ref_images, total_frames, image_size, device, keep_video_guide_frames= [], start_frame = 0, pre_src_video = None, inject_frames = [], outpainting_dims = None, any_background_ref = False): + def prepare_source(self, src_video, src_mask, src_ref_images, total_frames, image_size, device, keep_video_guide_frames= [], pre_src_video = None, inject_frames = [], outpainting_dims = None, any_background_ref = False): image_sizes = [] trim_video_guide = len(keep_video_guide_frames) def conv_tensor(t, device): @@ -659,13 +675,15 @@ class WanAny2V: inject_from_start = False if input_frames != None and denoising_strength < 1 : color_reference_frame = input_frames[:, -1:].clone() - if overlapped_latents != None: - overlapped_latents_frames_num = overlapped_latents.shape[2] - overlapped_frames_num = (overlapped_latents_frames_num-1) * 4 + 1 + if prefix_frames_count > 0: + overlapped_frames_num = prefix_frames_count + overlapped_latents_frames_num = (overlapped_frames_num - 1) // 4 + 1 + # overlapped_latents_frames_num = overlapped_latents.shape[2] + # overlapped_frames_num = (overlapped_latents_frames_num-1) * 4 + 1 else: overlapped_latents_frames_num = overlapped_frames_num = 0 if len(keep_frames_parsed) == 0 or image_outputs or (overlapped_frames_num + len(keep_frames_parsed)) == input_frames.shape[1] and all(keep_frames_parsed) : keep_frames_parsed = [] - injection_denoising_step = int(sampling_steps * (1. - denoising_strength) ) + injection_denoising_step = int( round(sampling_steps * (1.
- denoising_strength),4) ) latent_keep_frames = [] if source_latents.shape[2] < lat_frames or len(keep_frames_parsed) > 0: inject_from_start = True diff --git a/models/wan/df_handler.py b/models/wan/df_handler.py index 39d0a70..7a5d3ea 100644 --- a/models/wan/df_handler.py +++ b/models/wan/df_handler.py @@ -78,7 +78,7 @@ class family_handler(): return family_handler.query_model_files(computeList, base_model_type, model_filename, text_encoder_quantization) @staticmethod - def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized= False): + def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized= False, submodel_no_list = None): from .configs import WAN_CONFIGS from .wan_handler import family_handler cfg = WAN_CONFIGS['t2v-14B'] diff --git a/models/wan/multitalk/multitalk.py b/models/wan/multitalk/multitalk.py index b04f65f..52d2dd9 100644 --- a/models/wan/multitalk/multitalk.py +++ b/models/wan/multitalk/multitalk.py @@ -214,18 +214,20 @@ def process_tts_multi(text, save_dir, voice1, voice2): return s1, s2, save_path_sum -def get_full_audio_embeddings(audio_guide1 = None, audio_guide2 = None, combination_type ="add", num_frames = 0, fps = 25, sr = 16000, padded_frames_for_embeddings = 0, min_audio_duration = 0): +def get_full_audio_embeddings(audio_guide1 = None, audio_guide2 = None, combination_type ="add", num_frames = 0, fps = 25, sr = 16000, padded_frames_for_embeddings = 0, min_audio_duration = 0, return_sum_only = False): wav2vec_feature_extractor, audio_encoder= custom_init('cpu', "ckpts/chinese-wav2vec2-base") # wav2vec_feature_extractor, audio_encoder= custom_init('cpu', "ckpts/wav2vec") pad = int(padded_frames_for_embeddings/ fps * sr) new_human_speech1, new_human_speech2, sum_human_speechs, duration_changed = audio_prepare_multi(audio_guide1, audio_guide2, combination_type, duration= num_frames / fps, pad = pad, min_audio_duration = min_audio_duration ) - audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder, sr=sr, fps= fps) - audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder, sr=sr, fps= fps) - full_audio_embs = [] - if audio_guide1 != None: full_audio_embs.append(audio_embedding_1) - # if audio_guide1 != None: full_audio_embs.append(audio_embedding_1) - if audio_guide2 != None: full_audio_embs.append(audio_embedding_2) - if audio_guide2 == None and not duration_changed: sum_human_speechs = None + if return_sum_only: + full_audio_embs = None + else: + audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder, sr=sr, fps= fps) + audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder, sr=sr, fps= fps) + full_audio_embs = [] + if audio_guide1 != None: full_audio_embs.append(audio_embedding_1) + if audio_guide2 != None: full_audio_embs.append(audio_embedding_2) + if audio_guide2 == None and not duration_changed: sum_human_speechs = None return full_audio_embs, sum_human_speechs diff --git a/models/wan/wan_handler.py b/models/wan/wan_handler.py index 253bd92..6586176 100644 --- a/models/wan/wan_handler.py +++ b/models/wan/wan_handler.py @@ -166,7 +166,8 @@ class 
family_handler(): extra_model_def["lock_image_refs_ratios"] = True extra_model_def["background_removal_label"]= "Remove Backgrounds behind People / Objects, keep it for Landscape or positioned Frames" - + extra_model_def["video_guide_outpainting"] = [0,1] + if base_model_type in ["standin"]: extra_model_def["lock_image_refs_ratios"] = True extra_model_def["image_ref_choices"] = { @@ -293,7 +294,7 @@ class family_handler(): @staticmethod - def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized= False): + def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized= False, submodel_no_list = None): from .configs import WAN_CONFIGS if test_class_i2v(base_model_type): @@ -306,6 +307,7 @@ class family_handler(): config=cfg, checkpoint_dir="ckpts", model_filename=model_filename, + submodel_no_list = submodel_no_list, model_type = model_type, model_def = model_def, base_model_type=base_model_type, @@ -381,7 +383,7 @@ class family_handler(): if base_model_type in ["fantasy"]: ui_defaults.update({ "audio_guidance_scale": 5.0, - "sliding_window_size": 1, + "sliding_window_overlap" : 1, }) elif base_model_type in ["multitalk"]: @@ -398,6 +400,7 @@ class family_handler(): "guidance_scale": 5.0, "flow_shift": 7, # 11 for 720p "sliding_window_overlap" : 9, + "sliding_window_size": 81, "sample_solver" : "euler", "video_prompt_type": "QKI", "remove_background_images_ref" : 0, diff --git a/preprocessing/extract_vocals.py b/preprocessing/extract_vocals.py new file mode 100644 index 0000000..6564026 --- /dev/null +++ b/preprocessing/extract_vocals.py @@ -0,0 +1,69 @@ +from pathlib import Path +import os, tempfile +import numpy as np +import soundfile as sf +import librosa +import torch +import gc + +from audio_separator.separator import Separator + +def get_vocals(src_path: str, dst_path: str, min_seconds: float = 8) -> str: + """ + If the source audio is shorter than `min_seconds`, pad with trailing silence + in a temporary file, then run separation and save only the vocals to dst_path. + Returns the full path to the vocals file. 
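+
+    The default torch device is temporarily set to CPU while the separator runs
+    (and restored afterwards); only the "Vocals" stem is written, using the
+    BS-RoFormer model files under ckpts/roformer/.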
+ """ + + default_device = torch.get_default_device() + torch.set_default_device('cpu') + + dst = Path(dst_path) + dst.parent.mkdir(parents=True, exist_ok=True) + + # Quick duration check + duration = librosa.get_duration(path=src_path) + + use_path = src_path + temp_path = None + try: + if duration < min_seconds: + # Load (resample) and pad in memory + y, sr = librosa.load(src_path, sr=None, mono=False) + if y.ndim == 1: # ensure shape (channels, samples) + y = y[np.newaxis, :] + target_len = int(min_seconds * sr) + pad = max(0, target_len - y.shape[1]) + if pad: + y = np.pad(y, ((0, 0), (0, pad)), mode="constant") + + # Write a temp WAV for the separator + fd, temp_path = tempfile.mkstemp(suffix=".wav") + os.close(fd) + sf.write(temp_path, y.T, sr) # soundfile expects (frames, channels) + use_path = temp_path + + # Run separation: emit only the vocals, with your exact filename + sep = Separator( + output_dir=str(dst.parent), + output_format=(dst.suffix.lstrip(".") or "wav"), + output_single_stem="Vocals", + model_file_dir="ckpts/roformer/" #model_bs_roformer_ep_317_sdr_12.9755.ckpt" + ) + sep.load_model() + out_files = sep.separate(use_path, {"Vocals": dst.stem}) + + out = Path(out_files[0]) + return str(out if out.is_absolute() else (dst.parent / out)) + finally: + if temp_path and os.path.exists(temp_path): + os.remove(temp_path) + + torch.cuda.empty_cache() + gc.collect() + torch.set_default_device(default_device) + +# Example: +# final = extract_vocals("in/clip.mp3", "out/vocals.wav") +# print(final) + diff --git a/preprocessing/speakers_separator.py b/preprocessing/speakers_separator.py index 79cde9b..d5f2563 100644 --- a/preprocessing/speakers_separator.py +++ b/preprocessing/speakers_separator.py @@ -100,7 +100,7 @@ class OptimizedPyannote31SpeakerSeparator: self.hf_token = hf_token self._overlap_pipeline = None - def separate_audio(self, audio_path: str, output1, output2 ) -> Dict[str, str]: + def separate_audio(self, audio_path: str, output1, output2, audio_original_path: str = None ) -> Dict[str, str]: """Optimized main separation function with memory management.""" xprint("Starting optimized audio separation...") self._current_audio_path = os.path.abspath(audio_path) @@ -128,7 +128,11 @@ class OptimizedPyannote31SpeakerSeparator: gc.collect() # Save outputs efficiently - output_paths = self._save_outputs_optimized(waveform, final_masks, sample_rate, audio_path, output1, output2) + if audio_original_path is None: + waveform_original = waveform + else: + waveform_original, sample_rate = self.load_audio(audio_original_path) + output_paths = self._save_outputs_optimized(waveform_original, final_masks, sample_rate, audio_path, output1, output2) return output_paths @@ -835,7 +839,7 @@ class OptimizedPyannote31SpeakerSeparator: for turn, _, speaker in diarization.itertracks(yield_label=True): xprint(f"{speaker}: {turn.start:.1f}s - {turn.end:.1f}s") -def extract_dual_audio(audio, output1, output2, verbose = False): +def extract_dual_audio(audio, output1, output2, verbose = False, audio_original = None): global verbose_output verbose_output = verbose separator = OptimizedPyannote31SpeakerSeparator( @@ -848,7 +852,7 @@ def extract_dual_audio(audio, output1, output2, verbose = False): import time start_time = time.time() - outputs = separator.separate_audio(audio, output1, output2) + outputs = separator.separate_audio(audio, output1, output2, audio_original) elapsed_time = time.time() - start_time xprint(f"\n=== SUCCESS (completed in {elapsed_time:.2f}s) ===") diff --git 
a/requirements.txt b/requirements.txt index 1f008c7..e6c85fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,14 +21,15 @@ mutagen pyloudnorm librosa==0.11.0 speechbrain==1.0.3 - +audio-separator==0.36.1 + # UI & interaction gradio==5.29.0 dashscope loguru # Vision & segmentation -opencv-python>=4.9.0.80 +opencv-python>=4.12.0.88 segment-anything rembg[gpu]==2.0.65 onnxruntime-gpu diff --git a/wgp.py b/wgp.py index d085be7..de8e2f6 100644 --- a/wgp.py +++ b/wgp.py @@ -23,6 +23,7 @@ import importlib from shared.utils import notification_sound from shared.utils.loras_mutipliers import preparse_loras_multipliers, parse_loras_multipliers from shared.utils.utils import convert_tensor_to_image, save_image, get_video_info, get_file_creation_date, convert_image_to_video, calculate_new_dimensions, convert_image_to_tensor, calculate_dimensions_and_resize_image, rescale_and_crop, get_video_frame, resize_and_remove_background, rgb_bw_to_rgba_mask +from shared.utils.utils import calculate_new_dimensions, get_outpainting_frame_location, get_outpainting_full_area_dimensions from shared.utils.audio_video import extract_audio_tracks, combine_video_with_audio_tracks, combine_and_concatenate_video_with_audio_tracks, cleanup_temp_audio_files, save_video, save_image from shared.utils.audio_video import save_image_metadata, read_image_metadata from shared.match_archi import match_nvidia_architecture @@ -61,8 +62,8 @@ AUTOSAVE_FILENAME = "queue.zip" PROMPT_VARS_MAX = 10 target_mmgp_version = "3.6.0" -WanGP_version = "8.55" -settings_version = 2.34 +WanGP_version = "8.6" +settings_version = 2.35 max_source_video_frames = 3000 prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None @@ -199,9 +200,11 @@ def clean_image_list(gradio_list): def process_prompt_and_add_tasks(state, model_choice): - + def ret(): + return gr.update(), gr.update() + if state.get("validate_success",0) != 1: - return + return ret() state["validate_success"] = 0 model_filename = state["model_filename"] @@ -218,7 +221,7 @@ def process_prompt_and_add_tasks(state, model_choice): if inputs == None: gr.Warning("Internal state error: Could not retrieve inputs for the model.") queue = gen.get("queue", []) - return get_queue_table(queue) + return ret() model_def = get_model_def(model_type) model_handler = get_model_handler(model_type) image_outputs = inputs["image_mode"] > 0 @@ -247,7 +250,7 @@ def process_prompt_and_add_tasks(state, model_choice): if len(temporal_upsampling) >0: prompt += ["Temporal Upsampling"] if has_image_file_extension(edit_video_source) and len(temporal_upsampling) > 0: gr.Info("Temporal Upsampling can not be used with an Image") - return + return ret() film_grain_intensity = inputs.get("film_grain_intensity",0) film_grain_saturation = inputs.get("film_grain_saturation",0.5) # if film_grain_intensity >0: prompt += [f"Film Grain: intensity={film_grain_intensity}, saturation={film_grain_saturation}"] @@ -263,7 +266,7 @@ def process_prompt_and_add_tasks(state, model_choice): else: if audio_source is None: gr.Info("You must provide a custom Audio") - return + return ret() prompt += ["Custom Audio"] repeat_generation == 1 @@ -273,32 +276,32 @@ def process_prompt_and_add_tasks(state, model_choice): gr.Info("You must choose at least one Remux Method") else: gr.Info("You must choose at least one Post Processing Method") - return + return ret() inputs["prompt"] = ", ".join(prompt) add_video_task(**inputs) gen["prompts_max"] = 1 +
gen.get("prompts_max",0) state["validate_success"] = 1 queue= gen.get("queue", []) - return update_queue_data(queue) + return ret() if hasattr(model_handler, "validate_generative_settings"): error = model_handler.validate_generative_settings(model_type, model_def, inputs) if error is not None and len(error) > 0: gr.Info(error) - return + return ret() if inputs.get("cfg_star_switch", 0) != 0 and inputs.get("apg_switch", 0) != 0: gr.Info("Adaptive Progressive Guidance and Classifier Free Guidance Star can not be set at the same time") - return + return ret() prompt = inputs["prompt"] if len(prompt) ==0: gr.Info("Prompt cannot be empty.") gen = get_gen_info(state) queue = gen.get("queue", []) - return get_queue_table(queue) + return ret() prompt, errors = prompt_parser.process_template(prompt) if len(errors) > 0: gr.Info("Error processing prompt template: " + errors) - return + return ret() model_filename = get_model_filename(model_type) prompts = prompt.replace("\r", "").split("\n") prompts = [prompt.strip() for prompt in prompts if len(prompt.strip())>0 and not prompt.startswith("#")] @@ -306,7 +309,7 @@ def process_prompt_and_add_tasks(state, model_choice): gr.Info("Prompt cannot be empty.") gen = get_gen_info(state) queue = gen.get("queue", []) - return get_queue_table(queue) + return ret() resolution = inputs["resolution"] width, height = resolution.split("x") @@ -360,22 +363,22 @@ def process_prompt_and_add_tasks(state, model_choice): _, _, errors = parse_loras_multipliers(loras_multipliers, len(activated_loras), num_inference_steps, nb_phases= guidance_phases) if len(errors) > 0: gr.Info(f"Error parsing Loras Multipliers: {errors}") - return + return ret() if guidance_phases == 3: if switch_threshold < switch_threshold2: gr.Info(f"Phase 1-2 Switch Noise Level ({switch_threshold}) should be Greater than Phase 2-3 Switch Noise Level ({switch_threshold2}). 
As a reminder, noise will gradually go down from 1000 to 0.") - return + return ret() else: model_switch_phase = 1 if not any_steps_skipping: skip_steps_cache_type = "" if not model_def.get("lock_inference_steps", False) and model_type in ["ltxv_13B"] and num_inference_steps < 20: gr.Info("The minimum number of steps should be 20") - return + return ret() if skip_steps_cache_type == "mag": if num_inference_steps > 50: gr.Info("Mag Cache maximum number of steps is 50") - return + return ret() if image_mode > 0: audio_prompt_type = "" @@ -385,7 +388,7 @@ def process_prompt_and_add_tasks(state, model_choice): speakers_bboxes, error = parse_speakers_locations(speakers_locations) if len(error) > 0: gr.Info(error) - return + return ret() if MMAudio_setting != 0 and server_config.get("mmaudio_enabled", 0) != 0 and video_length <16: #should depend on the architecture gr.Info("MMAudio can generate an Audio track only if the Video is at least 1s long") @@ -393,23 +396,24 @@ def process_prompt_and_add_tasks(state, model_choice): if len(frames_positions.strip()) > 0: positions = frames_positions.split(" ") for pos_str in positions: - if not is_integer(pos_str): - gr.Info(f"Invalid Frame Position '{pos_str}'") - return - pos = int(pos_str) - if pos <1 or pos > max_source_video_frames: - gr.Info(f"Invalid Frame Position Value'{pos_str}'") - return + if not pos_str in ["L", "l"] and len(pos_str)>0: + if not is_integer(pos_str): + gr.Info(f"Invalid Frame Position '{pos_str}'") + return ret() + pos = int(pos_str) + if pos <1 or pos > max_source_video_frames: + gr.Info(f"Invalid Frame Position Value'{pos_str}'") + return ret() else: frames_positions = None if audio_source is not None and MMAudio_setting != 0: gr.Info("MMAudio and Custom Audio Soundtrack can't not be used at the same time") - return + return ret() if len(filter_letters(image_prompt_type, "VLG")) > 0 and len(keep_frames_video_source) > 0: if not is_integer(keep_frames_video_source) or int(keep_frames_video_source) == 0: gr.Info("The number of frames to keep must be a non null integer") - return + return ret() else: keep_frames_video_source = "" @@ -419,18 +423,18 @@ def process_prompt_and_add_tasks(state, model_choice): if "V" in image_prompt_type: if video_source == None: gr.Info("You must provide a Source Video file to continue") - return + return ret() else: video_source = None if "A" in audio_prompt_type: if audio_guide == None: gr.Info("You must provide an Audio Source") - return + return ret() if "B" in audio_prompt_type: if audio_guide2 == None: gr.Info("You must provide a second Audio Source") - return + return ret() else: audio_guide2 = None else: @@ -444,23 +448,23 @@ def process_prompt_and_add_tasks(state, model_choice): if model_def.get("one_image_ref_needed", False): if image_refs == None : gr.Info("You must provide an Image Reference") - return + return ret() if len(image_refs) > 1: gr.Info("Only one Image Reference (a person) is supported for the moment by this model") - return + return ret() if model_def.get("at_least_one_image_ref_needed", False): if image_refs == None : gr.Info("You must provide at least one Image Reference") - return + return ret() if "I" in video_prompt_type: if image_refs == None or len(image_refs) == 0: gr.Info("You must provide at least one Reference Image") - return + return ret() image_refs = clean_image_list(image_refs) if image_refs == None : gr.Info("A Reference Image should be an Image") - return + return ret() else: image_refs = None @@ -468,35 +472,36 @@ def process_prompt_and_add_tasks(state, 
model_choice): if image_outputs: if image_guide is None: gr.Info("You must provide a Control Image") - return + return ret() else: if video_guide is None: gr.Info("You must provide a Control Video") - return + return ret() if "A" in video_prompt_type and not "U" in video_prompt_type: if image_outputs: if image_mask is None: gr.Info("You must provide a Image Mask") - return + return ret() else: if video_mask is None: gr.Info("You must provide a Video Mask") - return + return ret() else: video_mask = None image_mask = None if "G" in video_prompt_type: - gr.Info(f"With Denoising Strength {denoising_strength:.1f}, denoising will start at Step no {int(num_inference_steps * (1. - denoising_strength))} ") + if denoising_strength < 1.: + gr.Info(f"With Denoising Strength {denoising_strength:.1f}, denoising will start at Step no {int(round(num_inference_steps * (1. - denoising_strength),4))} ") else: denoising_strength = 1.0 if len(keep_frames_video_guide) > 0 and model_type in ["ltxv_13B"]: gr.Info("Keep Frames for Control Video is not supported with LTX Video") - return + return ret() _, error = parse_keep_frames_video_guide(keep_frames_video_guide, video_length) if len(error) > 0: gr.Info(f"Invalid Keep Frames property: {error}") - return + return ret() else: video_guide = None image_guide = None @@ -516,14 +521,14 @@ def process_prompt_and_add_tasks(state, model_choice): if "S" in image_prompt_type: if image_start == None or isinstance(image_start, list) and len(image_start) == 0: gr.Info("You must provide a Start Image") - return + return ret() image_start = clean_image_list(image_start) if image_start == None : gr.Info("Start Image should be an Image") - return + return ret() if multi_prompts_gen_type == 1 and len(image_start) > 1: gr.Info("Only one Start Image is supported") - return + return ret() else: image_start = None @@ -532,19 +537,19 @@ def process_prompt_and_add_tasks(state, model_choice): if "E" in image_prompt_type: if image_end == None or isinstance(image_end, list) and len(image_end) == 0: gr.Info("You must provide an End Image") - return + return ret() image_end = clean_image_list(image_end) if image_end == None : gr.Info("End Image should be an Image") - return + return ret() if multi_prompts_gen_type == 0: if video_source is not None: if len(image_end)> 1: gr.Info("If a Video is to be continued and the option 'Each Text Prompt Will create a new generated Video' is set, there can be only one End Image") - return + return ret() elif len(image_start or []) != len(image_end or []): gr.Info("The number of Start and End Images should be the same when the option 'Each Text Prompt Will create a new generated Video'") - return + return ret() else: image_end = None @@ -553,7 +558,7 @@ def process_prompt_and_add_tasks(state, model_choice): if video_length > sliding_window_size: if model_type in ["t2v"] and not "G" in video_prompt_type : gr.Info(f"You have requested to Generate Sliding Windows with a Text to Video model. 
Unless you use the Video to Video feature this is useless as a t2v model doesn't see past frames and it will generate the same video in each new window.") - return + return ret() full_video_length = video_length if video_source is None else video_length + sliding_window_overlap -1 extra = "" if full_video_length == video_length else f" including {sliding_window_overlap} added for Video Continuation" no_windows = compute_sliding_window_no(full_video_length, sliding_window_size, sliding_window_discard_last_frames, sliding_window_overlap) @@ -561,22 +566,22 @@ def process_prompt_and_add_tasks(state, model_choice): if "recam" in model_filename: if video_guide == None: gr.Info("You must provide a Control Video") - return + return ret() computed_fps = get_computed_fps(force_fps, model_type , video_guide, video_source ) frames = get_resampled_video(video_guide, 0, 81, computed_fps) if len(frames)<81: gr.Info(f"Recammaster Control video should be at least 81 frames once the resampling at {computed_fps} fps has been done") - return + return ret() if "hunyuan_custom_custom_edit" in model_filename: if len(keep_frames_video_guide) > 0: gr.Info("Filtering Frames with this model is not supported") - return + return ret() if inputs["multi_prompts_gen_type"] != 0: if image_start != None and len(image_start) > 1: gr.Info("Only one Start Image must be provided if multiple prompts are used for different windows") - return + return ret() # if image_end != None and len(image_end) > 1: # gr.Info("Only one End Image must be provided if multiple prompts are used for different windows") @@ -624,7 +629,7 @@ def process_prompt_and_add_tasks(state, model_choice): if len(prompts) >= len(image_start): if len(prompts) % len(image_start) != 0: gr.Info("If there are more text prompts than input images the number of text prompts should be dividable by the number of images") - return + return ret() rep = len(prompts) // len(image_start) new_image_start = [] new_image_end = [] @@ -638,7 +643,7 @@ def process_prompt_and_add_tasks(state, model_choice): else: if len(image_start) % len(prompts) !=0: gr.Info("If there are more input images than text prompts the number of images should be dividable by the number of text prompts") - return + return ret() rep = len(image_start) // len(prompts) new_prompts = [] for i, _ in enumerate(image_start): @@ -666,11 +671,13 @@ def process_prompt_and_add_tasks(state, model_choice): override_inputs["prompt"] = "\n".join(prompts) inputs.update(override_inputs) add_video_task(**inputs) - - gen["prompts_max"] = new_prompts_count + gen.get("prompts_max",0) + new_prompts_count += gen.get("prompts_max",0) + gen["prompts_max"] = new_prompts_count state["validate_success"] = 1 queue= gen.get("queue", []) - return update_queue_data(queue) + first_time_in_queue = state.get("first_time_in_queue", True) + state["first_time_in_queue"] = True + return update_queue_data(queue, first_time_in_queue), gr.update(open=True) if new_prompts_count > 1 else gr.update() def get_preview_images(inputs): inputs_to_query = ["image_start", "video_source", "image_end", "video_guide", "image_guide", "video_mask", "image_mask", "image_refs" ] @@ -724,7 +731,6 @@ def add_video_task(**inputs): "start_image_data_base64": [pil_to_base64_uri(img, format="jpeg", quality=70) for img in start_image_data] if start_image_data != None else None, "end_image_data_base64": [pil_to_base64_uri(img, format="jpeg", quality=70) for img in end_image_data] if end_image_data != None else None }) - return update_queue_data(queue) def 
update_task_thumbnails(task, inputs): start_image_data, end_image_data, start_labels, end_labels = get_preview_images(inputs) @@ -1341,16 +1347,12 @@ def get_queue_table(queue): "✖" ]) return data -def update_queue_data(queue): +def update_queue_data(queue, first_time_in_queue =False): update_global_queue_ref(queue) data = get_queue_table(queue) - if len(data) == 0: - return gr.DataFrame(value=[], max_height=1) - elif len(data) == 1: - return gr.DataFrame(value=data, max_height= 83) - else: - return gr.DataFrame(value=data, max_height= 1000) + return gr.DataFrame(value=data) + def create_html_progress_bar(percentage=0.0, text="Idle", is_idle=True): bar_class = "progress-bar-custom idle" if is_idle else "progress-bar-custom" @@ -2004,8 +2006,10 @@ def get_model_recursive_prop(model_type, prop = "URLs", sub_prop_name = None, re raise Exception(f"Unknown model type '{model_type}'") -def get_model_filename(model_type, quantization ="int8", dtype_policy = "", module_type = None, submodel_no = 1, stack=[]): - if module_type is not None: +def get_model_filename(model_type, quantization ="int8", dtype_policy = "", module_type = None, submodel_no = 1, URLs = None, stack=[]): + if URLs is not None: + pass + elif module_type is not None: base_model_type = get_base_model_type(model_type) # model_type_handler = model_types_handlers[base_model_type] # modules_files = model_type_handler.query_modules_files() if hasattr(model_type_handler, "query_modules_files") else {} @@ -2032,7 +2036,8 @@ def get_model_filename(model_type, quantization ="int8", dtype_policy = "", modu if len(stack) > 10: raise Exception(f"Circular Reference in Model {key_name} dependencies: {stack}") return get_model_filename(URLs, quantization=quantization, dtype_policy=dtype_policy, submodel_no = submodel_no, stack = stack + [URLs]) - choices = [ ("ckpts/" + os.path.basename(path) if path.startswith("http") else path) for path in URLs ] + # choices = [ ("ckpts/" + os.path.basename(path) if path.startswith("http") else path) for path in URLs ] + choices = URLs if len(quantization) == 0: quantization = "bf16" @@ -2042,13 +2047,13 @@ def get_model_filename(model_type, quantization ="int8", dtype_policy = "", modu raw_filename = choices[0] else: if quantization in ("int8", "fp8"): - sub_choices = [ name for name in choices if quantization in name or quantization.upper() in name] + sub_choices = [ name for name in choices if quantization in os.path.basename(name) or quantization.upper() in os.path.basename(name)] else: - sub_choices = [ name for name in choices if "quanto" not in name] + sub_choices = [ name for name in choices if "quanto" not in os.path.basename(name)] if len(sub_choices) > 0: dtype_str = "fp16" if dtype == torch.float16 else "bf16" - new_sub_choices = [ name for name in sub_choices if dtype_str in name or dtype_str.upper() in name] + new_sub_choices = [ name for name in sub_choices if dtype_str in os.path.basename(name) or dtype_str.upper() in os.path.basename(name)] sub_choices = new_sub_choices if len(new_sub_choices) > 0 else sub_choices raw_filename = sub_choices[0] else: @@ -2107,6 +2112,10 @@ def fix_settings(model_type, ui_defaults): audio_prompt_type ="A" ui_defaults["audio_prompt_type"] = audio_prompt_type + if settings_version < 2.35 and any_audio_track(base_model_type): + audio_prompt_type = audio_prompt_type or "" + audio_prompt_type += "V" + ui_defaults["audio_prompt_type"] = audio_prompt_type video_prompt_type = ui_defaults.get("video_prompt_type", "") @@ -2156,13 +2165,14 @@ def fix_settings(model_type, 
ui_defaults): video_prompt_type = ui_defaults.get("video_prompt_type", "") image_ref_choices_list = model_def.get("image_ref_choices", {}).get("choices", []) - if len(image_ref_choices_list)==0: - video_prompt_type = del_in_sequence(video_prompt_type, "IK") - else: - first_choice = image_ref_choices_list[0][1] - if "I" in first_choice and not "I" in video_prompt_type: video_prompt_type += "I" - if len(image_ref_choices_list)==1 and "K" in first_choice and not "K" in video_prompt_type: video_prompt_type += "K" - ui_defaults["video_prompt_type"] = video_prompt_type + if model_def.get("guide_custom_choices", None) is None: + if len(image_ref_choices_list)==0: + video_prompt_type = del_in_sequence(video_prompt_type, "IK") + else: + first_choice = image_ref_choices_list[0][1] + if "I" in first_choice and not "I" in video_prompt_type: video_prompt_type += "I" + if len(image_ref_choices_list)==1 and "K" in first_choice and not "K" in video_prompt_type: video_prompt_type += "K" + ui_defaults["video_prompt_type"] = video_prompt_type model_handler = get_model_handler(base_model_type) if hasattr(model_handler, "fix_settings"): @@ -2200,7 +2210,8 @@ def get_default_settings(model_type): "slg_switch": 0, "slg_layers": [9], "slg_start_perc": 10, - "slg_end_perc": 90 + "slg_end_perc": 90, + "audio_prompt_type": "V", } model_handler = get_model_handler(model_type) model_handler.update_default_settings(base_model_type, model_def, ui_defaults) @@ -2348,7 +2359,7 @@ if args.compile: #args.fastest or lock_ui_compile = True -def save_model(model, model_type, dtype, config_file, submodel_no = 1, is_module = False, filter = None, no_fp16_main_model = True ): +def save_model(model, model_type, dtype, config_file, submodel_no = 1, is_module = False, filter = None, no_fp16_main_model = True, module_source_no = 1): model_def = get_model_def(model_type) # To save module and quantized modules # 1) set Transformer Model Quantization Type to 16 bits @@ -2359,10 +2370,10 @@ def save_model(model, model_type, dtype, config_file, submodel_no = 1, is_mod if model_def == None: return if is_module: url_key = "modules" - source_key = "module_source" + source_key = "module_source" if module_source_no <=1 else "module_source2" else: url_key = "URLs" if submodel_no <=1 else "URLs" + str(submodel_no) - source_key = "source" + source_key = "source" if submodel_no <=1 else "source2" URLs= model_def.get(url_key, None) if URLs is None: return if isinstance(URLs, str): @@ -2377,6 +2388,9 @@ def save_model(model, model_type, dtype, config_file, submodel_no = 1, is_mod print("Target Module files are missing") return URLs= URLs[0] + if isinstance(URLs, dict): + url_dict_key = "URLs" if module_source_no ==1 else "URLs2" + URLs = URLs[url_dict_key] for url in URLs: if "quanto" not in url and dtypestr in url: model_filename = os.path.basename(url) @@ -2410,8 +2424,12 @@ def save_model(model, model_type, dtype, config_file, submodel_no = 1, is_mod elif not os.path.isfile(quanto_filename): offload.save_model(model, quanto_filename, config_file_path=config_file, do_quantize= True, filter_sd=filter) print(f"New quantized file '{quanto_filename}' had been created for finetune Id '{model_type}'.") - model_def[url_key][0].append(quanto_filename) - saved_finetune_def["model"][url_key][0].append(quanto_filename) + if isinstance(model_def[url_key][0],dict): + model_def[url_key][0][url_dict_key].append(quanto_filename) + saved_finetune_def["model"][url_key][0][url_dict_key].append(quanto_filename) + else: + model_def[url_key][0].append(quanto_filename) 
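+                    # also record the new quantized checkpoint in the finetune definition that is written back to disk below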
+ saved_finetune_def["model"][url_key][0].append(quanto_filename) update_model_def = True if update_model_def: with open(finetune_file, "w", encoding="utf-8") as writer: @@ -2466,6 +2484,14 @@ def get_loras_preprocessor(transformer, model_type): return preprocessor_wrapper +def get_local_model_filename(model_filename): + if model_filename.startswith("http"): + local_model_filename = os.path.join("ckpts", os.path.basename(model_filename)) + else: + local_model_filename = model_filename + return local_model_filename + + def process_files_def(repoId, sourceFolderList, fileList): targetRoot = "ckpts/" @@ -2491,7 +2517,7 @@ def download_mmaudio(): } process_files_def(**enhancer_def) -def download_models(model_filename = None, model_type= None, module_type = None, submodel_no = 1): +def download_models(model_filename = None, model_type= None, module_type = False, submodel_no = 1): def computeList(filename): if filename == None: return [] @@ -2506,11 +2532,12 @@ def download_models(model_filename = None, model_type= None, module_type = None, shared_def = { "repoId" : "DeepBeepMeep/Wan2.1", - "sourceFolderList" : [ "pose", "scribble", "flow", "depth", "mask", "wav2vec", "chinese-wav2vec2-base", "pyannote", "det_align", "" ], + "sourceFolderList" : [ "pose", "scribble", "flow", "depth", "mask", "wav2vec", "chinese-wav2vec2-base", "roformer", "pyannote", "det_align", "" ], "fileList" : [ ["dw-ll_ucoco_384.onnx", "yolox_l.onnx"],["netG_A_latest.pth"], ["raft-things.pth"], ["depth_anything_v2_vitl.pth","depth_anything_v2_vitb.pth"], ["sam_vit_h_4b8939_fp16.safetensors"], ["config.json", "feature_extractor_config.json", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer_config.json", "vocab.json"], ["config.json", "pytorch_model.bin", "preprocessor_config.json"], + ["model_bs_roformer_ep_317_sdr_12.9755.ckpt", "model_bs_roformer_ep_317_sdr_12.9755.yaml", "download_checks.json"], ["pyannote_model_wespeaker-voxceleb-resnet34-LM.bin", "pytorch_model_segmentation-3.0.bin"], ["detface.pt"], [ "flownet.pkl" ] ] } process_files_def(**shared_def) @@ -2558,37 +2585,25 @@ def download_models(model_filename = None, model_type= None, module_type = None, base_model_type = get_base_model_type(model_type) model_def = get_model_def(model_type) - source = model_def.get("source", None) - module_source = model_def.get("module_source", None) + any_source = ("source2" if submodel_no ==2 else "source") in model_def + any_module_source = ("module_source2" if submodel_no ==2 else "module_source") in model_def model_type_handler = model_types_handlers[base_model_type] - - if source is not None and module_type is None or module_source is not None and module_type is not None: + local_model_filename = get_local_model_filename(model_filename) + + if any_source and not module_type or any_module_source and module_type: model_filename = None else: - if not os.path.isfile(model_filename): - if module_type is not None: - key_name = "modules" - URLs = module_type - if isinstance(module_type, str): - URLs = get_model_recursive_prop(module_type, key_name, sub_prop_name="_list", return_list= False) - else: - key_name = "URLs" if submodel_no <= 1 else f"URLs{submodel_no}" - URLs = get_model_recursive_prop(model_type, key_name, return_list= False) - if isinstance(URLs, str): - raise Exception("Missing model " + URLs) - use_url = model_filename - for url in URLs: - if os.path.basename(model_filename) in url: - use_url = url - break - if not url.startswith("http"): - raise Exception(f"Model 
'{model_filename}' in field '{key_name}' was not found locally and no URL was provided to download it. Please add an URL in the model definition file.") - try: - download_file(use_url, model_filename) - except Exception as e: - if os.path.isfile(model_filename): os.remove(model_filename) - raise Exception(f"{key_name} '{use_url}' is invalid for Model '{model_filename}' : {str(e)}'") + if not os.path.isfile(local_model_filename): + url = model_filename + if not url.startswith("http"): + raise Exception(f"Model '{model_filename}' was not found locally and no URL was provided to download it. Please add an URL in the model definition file.") + try: + download_file(url, local_model_filename) + except Exception as e: + if os.path.isfile(local_model_filename): os.remove(local_model_filename) + raise Exception(f"'{url}' is invalid for Model '{local_model_filename}' : {str(e)}'") + if module_type: return model_filename = None preload_URLs = get_model_recursive_prop(model_type, "preload_URLs", return_list= True) @@ -2614,6 +2629,7 @@ def download_models(model_filename = None, model_type= None, module_type = None, except Exception as e: if os.path.isfile(filename): os.remove(filename) raise Exception(f"Lora URL '{url}' is invalid: {str(e)}'") + if module_type: return model_files = model_type_handler.query_model_files(computeList, base_model_type, model_filename, text_encoder_quantization) if not isinstance(model_files, list): model_files = [model_files] for one_repo in model_files: @@ -2800,7 +2816,8 @@ def load_models(model_type, override_profile = -1): model_filename2 = get_model_filename(model_type=model_type, quantization= "" if save_quantized else transformer_quantization, dtype_policy = transformer_dtype_policy, submodel_no=2) # !!!! else: model_filename2 = None - modules = get_model_recursive_prop(model_type, "modules", return_list= True) + modules = get_model_recursive_prop(model_type, "modules", return_list= True) + modules = [get_model_recursive_prop(module, "modules", sub_prop_name ="_list", return_list= True) if isinstance(module, str) else module for module in modules ] if save_quantized and "quanto" in model_filename: save_quantized = False print("Need to provide a non quantized model to create a quantized model to be saved") @@ -2825,27 +2842,40 @@ def load_models(model_type, override_profile = -1): if model_filename2 != None: model_file_list += [model_filename2] model_type_list += [model_type] - module_type_list += [None] + module_type_list += [False] model_submodel_no_list += [2] for module_type in modules: - model_file_list.append(get_model_filename(model_type, transformer_quantization, transformer_dtype, module_type= module_type)) - model_type_list.append(model_type) - module_type_list.append(module_type) - model_submodel_no_list.append(0) + if isinstance(module_type,dict): + URLs1 = module_type.get("URLs", None) + if URLs1 is None: raise Exception(f"No URLs defined for Module {module_type}") + model_file_list.append(get_model_filename(model_type, transformer_quantization, transformer_dtype, URLs = URLs1)) + URLs2 = module_type.get("URLs2", None) + if URLs2 is None: raise Exception(f"No URL2s defined for Module {module_type}") + model_file_list.append(get_model_filename(model_type, transformer_quantization, transformer_dtype, URLs = URLs2)) + model_type_list += [model_type] * 2 + module_type_list += [True] * 2 + model_submodel_no_list += [1,2] + else: + model_file_list.append(get_model_filename(model_type, transformer_quantization, transformer_dtype, module_type= module_type)) + 
model_type_list.append(model_type) + module_type_list.append(True) + model_submodel_no_list.append(0) + + local_model_file_list= [] for filename, file_model_type, file_module_type, submodel_no in zip(model_file_list, model_type_list, module_type_list, model_submodel_no_list): download_models(filename, file_model_type, file_module_type, submodel_no) + local_model_file_list.append( get_local_model_filename(filename) ) VAE_dtype = torch.float16 if server_config.get("vae_precision","16") == "16" else torch.float mixed_precision_transformer = server_config.get("mixed_precision","0") == "1" transformer_type = None - for submodel_no, filename in zip(model_submodel_no_list, model_file_list): - if submodel_no>=1: + for module_type, filename in zip(module_type_list, local_model_file_list): + if not module_type: print(f"Loading Model '{filename}' ...") else: print(f"Loading Module '{filename}' ...") wan_model, pipe = model_types_handlers[base_model_type].load_model( - model_file_list, model_type, base_model_type, model_def, quantizeTransformer = quantizeTransformer, text_encoder_quantization = text_encoder_quantization, - dtype = transformer_dtype, VAE_dtype = VAE_dtype, mixed_precision_transformer = mixed_precision_transformer, save_quantized = save_quantized) + local_model_file_list, model_type, base_model_type, model_def, quantizeTransformer = quantizeTransformer, text_encoder_quantization = text_encoder_quantization, + dtype = transformer_dtype, VAE_dtype = VAE_dtype, mixed_precision_transformer = mixed_precision_transformer, save_quantized = save_quantized, submodel_no_list = model_submodel_no_list, ) kwargs = {} profile = init_pipe(pipe, kwargs, override_profile) @@ -2883,7 +2913,7 @@ def generate_header(model_type, compile, attention_mode): description_container = [""] get_model_name(model_type, description_container) - model_filename = get_model_filename(model_type, transformer_quantization, transformer_dtype_policy) or "" + model_filename = os.path.basename(get_model_filename(model_type, transformer_quantization, transformer_dtype_policy)) or "" description = description_container[0] header = f"
{description}
" overridden_attention = get_overridden_attention(model_type) @@ -3517,6 +3547,48 @@ def get_resampled_video(video_in, start_frame, max_frames, target_fps, bridge='t # print(f"frame nos: {frame_nos}") return frames_list +# def get_resampled_video(video_in, start_frame, max_frames, target_fps): +# from torchvision.io import VideoReader +# import torch +# from shared.utils.utils import resample + +# vr = VideoReader(video_in, "video") +# meta = vr.get_metadata()["video"] + +# fps = round(float(meta["fps"][0])) +# duration_s = float(meta["duration"][0]) +# num_src_frames = int(round(duration_s * fps)) # robust length estimate + +# if max_frames < 0: +# max_frames = max(int(num_src_frames / fps * target_fps + max_frames), 0) + +# frame_nos = resample( +# fps, num_src_frames, +# max_target_frames_count=max_frames, +# target_fps=target_fps, +# start_target_frame=start_frame +# ) +# if len(frame_nos) == 0: +# return torch.empty((0,)) # nothing to return + +# target_ts = [i / fps for i in frame_nos] + +# # Read forward once, grabbing frames when we pass each target timestamp +# frames = [] +# vr.seek(target_ts[0]) +# idx = 0 +# tol = 0.5 / fps # half-frame tolerance +# for frame in vr: +# t = float(frame["pts"]) # seconds +# if idx < len(target_ts) and t + tol >= target_ts[idx]: +# frames.append(frame["data"].permute(1,2,0)) # Tensor [H, W, C] +# idx += 1 +# if idx >= len(target_ts): +# break + +# return frames + + def get_preprocessor(process_type, inpaint_color): if process_type=="pose": from preprocessing.dwpose.pose import PoseBodyFaceVideoAnnotator @@ -3605,11 +3677,29 @@ def process_images_multithread(image_processor, items, process_type, wrap_in_lis return results -def preprocess_image_with_mask(input_image, input_mask, height, width, fit_canvas = False, block_size= 16, expand_scale = 2): +def preprocess_image_with_mask(input_image, input_mask, height, width, fit_canvas = False, fit_crop = False, block_size= 16, expand_scale = 2, outpainting_dims = None, inpaint_color = 127): frame_width, frame_height = input_image.size + if fit_crop: + input_image = rescale_and_crop(input_image, width, height) + if input_mask is not None: + input_mask = rescale_and_crop(input_mask, width, height) + return input_image, input_mask + + if outpainting_dims != None: + if fit_canvas != None: + frame_height, frame_width = get_outpainting_full_area_dimensions(frame_height,frame_width, outpainting_dims) + else: + frame_height, frame_width = height, width + if fit_canvas != None: height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas = fit_canvas, block_size = block_size) + + if outpainting_dims != None: + final_height, final_width = height, width + height, width, margin_top, margin_left = get_outpainting_frame_location(final_height, final_width, outpainting_dims, 1) + + if fit_canvas != None or outpainting_dims != None: input_image = input_image.resize((width, height), resample=Image.Resampling.LANCZOS) if input_mask is not None: input_mask = input_mask.resize((width, height), resample=Image.Resampling.LANCZOS) @@ -3621,10 +3711,23 @@ def preprocess_image_with_mask(input_image, input_mask, height, width, fit_canva input_mask = np.array(input_mask) input_mask = op_expand(input_mask, kernel, iterations=3) input_mask = Image.fromarray(input_mask) + + if outpainting_dims != None: + inpaint_color = inpaint_color / 127.5-1 + image = convert_image_to_tensor(input_image) + full_frame= torch.full( (image.shape[0], final_height, final_width), inpaint_color, dtype= torch.float, 
device= image.device) + full_frame[:, margin_top:margin_top+height, margin_left:margin_left+width] = image + input_image = convert_tensor_to_image(full_frame) + + if input_mask is not None: + mask = convert_image_to_tensor(input_mask) + full_frame= torch.full( (mask.shape[0], final_height, final_width), 1, dtype= torch.float, device= mask.device) + full_frame[:, margin_top:margin_top+height, margin_left:margin_left+width] = mask + input_mask = convert_tensor_to_image(full_frame) + return input_image, input_mask def preprocess_video_with_mask(input_video_path, input_mask_path, height, width, max_frames, start_frame=0, fit_canvas = None, fit_crop = False, target_fps = 16, block_size= 16, expand_scale = 2, process_type = "inpaint", process_type2 = None, to_bbox = False, RGB_Mask = False, negate_mask = False, process_outside_mask = None, inpaint_color = 127, outpainting_dims = None, proc_no = 1): - from shared.utils.utils import calculate_new_dimensions, get_outpainting_frame_location, get_outpainting_full_area_dimensions def mask_to_xyxy_box(mask): rows, cols = np.where(mask == 255) @@ -4592,6 +4695,7 @@ def generate_video( reuse_frames = min(sliding_window_size - 4, sliding_window_overlap) else: sliding_window = False + sliding_window_size = current_video_length reuse_frames = 0 _, latent_size = get_model_min_frames_and_step(model_type) @@ -4603,10 +4707,6 @@ def generate_video( # Source Video or Start Image > Control Video > Image Ref (background or positioned frames only) > UI Width, Height # Image Ref (non background and non positioned frames) are boxed in a white canvas in order to keep their own width/height ratio frames_to_inject = [] - if image_refs is not None: - frames_positions_list = [ int(pos)-1 for pos in frames_positions.split(" ")] if frames_positions is not None and len(frames_positions)> 0 else [] - frames_positions_list = frames_positions_list[:len(image_refs)] - nb_frames_positions = len(frames_positions_list) any_background_ref = 0 if "K" in video_prompt_type: any_background_ref = 2 if model_def.get("all_image_refs_are_background_ref", False) else 1 @@ -4642,29 +4742,47 @@ def generate_video( output_new_audio_data = None output_new_audio_filepath = None original_audio_guide = audio_guide + original_audio_guide2 = audio_guide2 audio_proj_split = None audio_proj_full = None audio_scale = None audio_context_lens = None if (fantasy or multitalk or hunyuan_avatar or hunyuan_custom_audio) and audio_guide != None: from models.wan.fantasytalking.infer import parse_audio + from preprocessing.extract_vocals import get_vocals import librosa duration = librosa.get_duration(path=audio_guide) combination_type = "add" + clean_audio_files = "V" in audio_prompt_type if audio_guide2 is not None: duration2 = librosa.get_duration(path=audio_guide2) if "C" in audio_prompt_type: duration += duration2 else: duration = min(duration, duration2) combination_type = "para" if "P" in audio_prompt_type else "add" + if clean_audio_files: + audio_guide = get_vocals(original_audio_guide, get_available_filename(save_path, audio_guide, "_clean", ".wav")) + audio_guide2 = get_vocals(original_audio_guide2, get_available_filename(save_path, audio_guide2, "_clean2", ".wav")) + temp_filenames_list += [audio_guide, audio_guide2] else: if "X" in audio_prompt_type: + # dual speaker, voice separation from preprocessing.speakers_separator import extract_dual_audio combination_type = "para" if args.save_speakers: audio_guide, audio_guide2 = "speaker1.wav", "speaker2.wav" else: audio_guide, audio_guide2 = 
get_available_filename(save_path, audio_guide, "_tmp1", ".wav"), get_available_filename(save_path, audio_guide, "_tmp2", ".wav") - extract_dual_audio(original_audio_guide, audio_guide, audio_guide2 ) + temp_filenames_list += [audio_guide, audio_guide2] + if clean_audio_files: + clean_audio_guide = get_vocals(original_audio_guide, get_available_filename(save_path, original_audio_guide, "_clean", ".wav")) + temp_filenames_list += [clean_audio_guide] + extract_dual_audio(clean_audio_guide if clean_audio_files else original_audio_guide, audio_guide, audio_guide2) + + elif clean_audio_files: + # Single Speaker + audio_guide = get_vocals(original_audio_guide, get_available_filename(save_path, audio_guide, "_clean", ".wav")) + temp_filenames_list += [audio_guide] + output_new_audio_filepath = original_audio_guide current_video_length = min(int(fps * duration //latent_size) * latent_size + latent_size + 1, current_video_length) @@ -4676,10 +4794,10 @@ def generate_video( # pad audio_proj_full if aligned to beginning of window to simulate source window overlap min_audio_duration = current_video_length/fps if reset_control_aligment else video_source_duration + current_video_length/fps audio_proj_full, output_new_audio_data = get_full_audio_embeddings(audio_guide1 = audio_guide, audio_guide2= audio_guide2, combination_type= combination_type , num_frames= max_source_video_frames, sr= audio_sampling_rate, fps =fps, padded_frames_for_embeddings = (reuse_frames if reset_control_aligment else 0), min_audio_duration = min_audio_duration) - if output_new_audio_data is not None: output_new_audio_filepath= None # need to build original speaker track if it changed size (due to padding at the end) or if it has been combined - if not args.save_speakers and "X" in audio_prompt_type: - os.remove(audio_guide) - os.remove(audio_guide2) + if output_new_audio_data is not None: # not none if modified + if clean_audio_files: # need to rebuild the sum of audios with original audio + _, output_new_audio_data = get_full_audio_embeddings(audio_guide1 = original_audio_guide, audio_guide2= original_audio_guide2, combination_type= combination_type , num_frames= max_source_video_frames, sr= audio_sampling_rate, fps =fps, padded_frames_for_embeddings = (reuse_frames if reset_control_aligment else 0), min_audio_duration = min_audio_duration, return_sum_only= True) + output_new_audio_filepath= None # need to build original speaker track if it changed size (due to padding at the end) or if it has been combined if hunyuan_custom_edit and video_guide != None: import cv2 @@ -4687,8 +4805,6 @@ def generate_video( length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) current_video_length = min(current_video_length, length) - if image_guide is not None: - image_guide, image_mask = preprocess_image_with_mask(image_guide, image_mask, height, width, fit_canvas = None, block_size= block_size, expand_scale = mask_expand) seed = set_seed(seed) @@ -4727,7 +4843,7 @@ def generate_video( if repeat_no >= total_generation: break repeat_no +=1 gen["repeat_no"] = repeat_no - src_video, src_mask, src_ref_images = None, None, None + src_video = src_mask = src_ref_images = new_image_guide = new_image_mask = None prefix_video = pre_video_frame = None source_video_overlap_frames_count = 0 # number of frames overalapped in source video for first window source_video_frames_count = 0 # number of frames to use in source video (processing starts source_video_overlap_frames_count frames before ) @@ -4899,7 +5015,7 @@ def generate_video( elif "R" in video_prompt_type: 
# sparse video to video src_image = get_video_frame(video_guide, aligned_guide_start_frame, return_last_if_missing = True, return_PIL = True) - src_image, _, _ = calculate_dimensions_and_resize_image(src_image, new_height, new_width, sample_fit_canvas, fit_crop, block_size = block_size) + src_image, _, _ = calculate_dimensions_and_resize_image(src_image, image_size[0], image_size[1 ], sample_fit_canvas, fit_crop, block_size = block_size) refresh_preview["video_guide"] = src_image src_video = convert_image_to_tensor(src_image).unsqueeze(1) if sample_fit_canvas != None: @@ -4918,15 +5034,38 @@ def generate_video( if pre_video_guide != None: src_video = torch.cat( [pre_video_guide, src_video], dim=1) elif image_guide is not None: - image_guide, new_height, new_width = calculate_dimensions_and_resize_image(image_guide, height, width, sample_fit_canvas, fit_crop, block_size = block_size) - image_size = (new_height, new_width) - refresh_preview["image_guide"] = image_guide - sample_fit_canvas = None - if image_mask is not None: - image_mask, _, _ = calculate_dimensions_and_resize_image(image_mask, new_height, new_width, sample_fit_canvas, fit_crop, block_size = block_size) - refresh_preview["image_mask"] = image_mask + new_image_guide, new_image_mask = preprocess_image_with_mask(image_guide, image_mask, image_size[0], image_size[1], fit_canvas = sample_fit_canvas, fit_crop= fit_crop, block_size= block_size, expand_scale = mask_expand, outpainting_dims=outpainting_dims) + if sample_fit_canvas is not None: + image_size = (new_image_guide.size[1], new_image_guide.size[0]) + sample_fit_canvas = None + refresh_preview["image_guide"] = new_image_guide + if new_image_mask is not None: + refresh_preview["image_mask"] = new_image_mask if window_no == 1 and image_refs is not None and len(image_refs) > 0: + if repeat_no == 1: + frames_positions_list = [] + if frames_positions is not None and len(frames_positions)> 0: + positions = frames_positions.split(" ") + cur_end_pos = -1 + (source_video_frames_count - source_video_overlap_frames_count) #if reset_control_aligment else 0 + last_frame_no = requested_frames_to_generate + source_video_frames_count - source_video_overlap_frames_count + joker_used = False + project_window_no = 1 + for pos in positions : + if len(pos) > 0: + if pos in ["L", "l"]: + cur_end_pos += sliding_window_size if project_window_no > 1 else current_video_length + if cur_end_pos >= last_frame_no and not joker_used: + joker_used = True + cur_end_pos = last_frame_no -1 + project_window_no += 1 + frames_positions_list.append(cur_end_pos) + cur_end_pos -= sliding_window_discard_last_frames + reuse_frames + else: + frames_positions_list.append(int(pos)-1 + alignment_shift) + frames_positions_list = frames_positions_list[:len(image_refs)] + nb_frames_positions = len(frames_positions_list) + if sample_fit_canvas is not None and (nb_frames_positions > 0 or "K" in video_prompt_type) : from shared.utils.utils import get_outpainting_full_area_dimensions w, h = image_refs[0].size @@ -4963,7 +5102,7 @@ def generate_video( frames_to_inject[pos] = image_refs[i] if vace : - frames_to_inject_parsed = frames_to_inject[aligned_guide_start_frame: aligned_guide_end_frame] + frames_to_inject_parsed = frames_to_inject[guide_start_frame: guide_end_frame] image_refs_copy = image_refs[nb_frames_positions:].copy() if image_refs != None and len(image_refs) > nb_frames_positions else None # required since prepare_source do inplace modifications src_video, src_mask, src_ref_images = 
wan_model.prepare_source([video_guide_processed] if video_guide_processed2 == None else [video_guide_processed, video_guide_processed2], @@ -4971,14 +5110,13 @@ def generate_video( [image_refs_copy] if video_guide_processed2 == None else [image_refs_copy, image_refs_copy], current_video_length, image_size = image_size, device ="cpu", keep_video_guide_frames=keep_frames_parsed, - start_frame = aligned_guide_start_frame, pre_src_video = [pre_video_guide] if video_guide_processed2 == None else [pre_video_guide, pre_video_guide], inject_frames= frames_to_inject_parsed, outpainting_dims = outpainting_dims, any_background_ref = any_background_ref ) if len(frames_to_inject_parsed) or any_background_ref: - new_image_refs = [convert_tensor_to_image(src_video[0], frame_no) for frame_no, inject in enumerate(frames_to_inject_parsed) if inject] + new_image_refs = [convert_tensor_to_image(src_video[0], frame_no + aligned_guide_start_frame - aligned_window_start_frame) for frame_no, inject in enumerate(frames_to_inject_parsed) if inject] if any_background_ref: new_image_refs += [convert_tensor_to_image(image_refs_copy[0], 0)] + image_refs[nb_frames_positions+1:] else: @@ -5094,8 +5232,9 @@ def generate_video( pre_video_frame = pre_video_frame, original_input_ref_images = original_image_refs[nb_frames_positions:] if original_image_refs is not None else [], image_refs_relative_size = image_refs_relative_size, - image_guide= image_guide, - image_mask= image_mask, + image_guide= new_image_guide, + image_mask= new_image_mask, + outpainting_dims = outpainting_dims, ) except Exception as e: if len(control_audio_tracks) > 0 or len(source_audio_tracks) > 0: @@ -6755,10 +6894,17 @@ def refresh_audio_prompt_type_remux(state, audio_prompt_type, remux): audio_prompt_type = add_to_sequence(audio_prompt_type, remux) return audio_prompt_type +def refresh_remove_background_sound(state, audio_prompt_type, remove_background_sound): + audio_prompt_type = del_in_sequence(audio_prompt_type, "V") + if remove_background_sound: + audio_prompt_type = add_to_sequence(audio_prompt_type, "V") + return audio_prompt_type + + def refresh_audio_prompt_type_sources(state, audio_prompt_type, audio_prompt_type_sources): audio_prompt_type = del_in_sequence(audio_prompt_type, "XCPAB") audio_prompt_type = add_to_sequence(audio_prompt_type, audio_prompt_type_sources) - return audio_prompt_type, gr.update(visible = "A" in audio_prompt_type), gr.update(visible = "B" in audio_prompt_type), gr.update(visible = ("B" in audio_prompt_type or "X" in audio_prompt_type)) + return audio_prompt_type, gr.update(visible = "A" in audio_prompt_type), gr.update(visible = "B" in audio_prompt_type), gr.update(visible = ("B" in audio_prompt_type or "X" in audio_prompt_type)), gr.update(visible= any_letters(audio_prompt_type, "ABX")) def refresh_image_prompt_type_radio(state, image_prompt_type, image_prompt_type_radio): image_prompt_type = del_in_sequence(image_prompt_type, "VLTS") @@ -6775,7 +6921,7 @@ def refresh_image_prompt_type_endcheckbox(state, image_prompt_type, image_prompt image_prompt_type = add_to_sequence(image_prompt_type, image_prompt_type_radio) return image_prompt_type, gr.update(visible = "E" in image_prompt_type ) -def refresh_video_prompt_type_image_refs(state, video_prompt_type, video_prompt_type_image_refs): +def refresh_video_prompt_type_image_refs(state, video_prompt_type, video_prompt_type_image_refs, image_mode): model_type = state["model_type"] model_def = get_model_def(model_type) image_ref_choices = model_def.get("image_ref_choices", 
None) @@ -6785,11 +6931,10 @@ def refresh_video_prompt_type_image_refs(state, video_prompt_type, video_prompt_ video_prompt_type = del_in_sequence(video_prompt_type, "KFI") video_prompt_type = add_to_sequence(video_prompt_type, video_prompt_type_image_refs) visible = "I" in video_prompt_type - vace= test_vace_module(state["model_type"]) - + any_outpainting= image_mode in model_def.get("video_guide_outpainting", []) rm_bg_visible= visible and not model_def.get("no_background_removal", False) img_rel_size_visible = visible and model_def.get("any_image_refs_relative_size", False) - return video_prompt_type, gr.update(visible = visible),gr.update(visible = rm_bg_visible), gr.update(visible = img_rel_size_visible), gr.update(visible = visible and "F" in video_prompt_type_image_refs), gr.update(visible= ("F" in video_prompt_type_image_refs or "K" in video_prompt_type_image_refs or "V" in video_prompt_type) and vace ) + return video_prompt_type, gr.update(visible = visible),gr.update(visible = rm_bg_visible), gr.update(visible = img_rel_size_visible), gr.update(visible = visible and "F" in video_prompt_type_image_refs), gr.update(visible= ("F" in video_prompt_type_image_refs or "K" in video_prompt_type_image_refs or "V" in video_prompt_type) and any_outpainting ) def switch_image_guide_editor(image_mode, old_video_prompt_type , video_prompt_type, old_image_mask_guide_value, old_image_guide_value, old_image_mask_value ): if image_mode == 0: return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False) @@ -6840,13 +6985,13 @@ def refresh_video_prompt_type_video_guide(state, video_prompt_type, video_prompt visible = "V" in video_prompt_type model_type = state["model_type"] model_def = get_model_def(model_type) - vace= test_vace_module(model_type) + any_outpainting= image_mode in model_def.get("video_guide_outpainting", []) mask_visible = visible and "A" in video_prompt_type and not "U" in video_prompt_type image_outputs = image_mode > 0 keep_frames_video_guide_visible = not image_outputs and visible and not model_def.get("keep_frames_video_guide_not_supported", False) image_mask_guide, image_guide, image_mask = switch_image_guide_editor(image_mode, old_video_prompt_type , video_prompt_type, old_image_mask_guide_value, old_image_guide_value, old_image_mask_value ) - return video_prompt_type, gr.update(visible = visible and not image_outputs), image_guide, gr.update(visible = keep_frames_video_guide_visible), gr.update(visible = visible and "G" in video_prompt_type), gr.update(visible= (visible or "F" in video_prompt_type or "K" in video_prompt_type) and vace), gr.update(visible= visible and not "U" in video_prompt_type ), gr.update(visible= mask_visible and not image_outputs), image_mask, image_mask_guide, gr.update(visible= mask_visible) + return video_prompt_type, gr.update(visible = visible and not image_outputs), image_guide, gr.update(visible = keep_frames_video_guide_visible), gr.update(visible = visible and "G" in video_prompt_type), gr.update(visible= (visible or "F" in video_prompt_type or "K" in video_prompt_type) and any_outpainting), gr.update(visible= visible and not "U" in video_prompt_type ), gr.update(visible= mask_visible and not image_outputs), image_mask, image_mask_guide, gr.update(visible= mask_visible) def refresh_video_prompt_type_video_guide_alt(state, video_prompt_type, video_prompt_type_video_guide_alt, image_mode): @@ -7112,6 +7257,12 @@ def refresh_video_length_label(state, current_video_length, force_fps, video_gui computed_fps = 
get_computed_fps(force_fps, base_model_type , video_guide, video_source ) return gr.update(label= compute_video_length_label(computed_fps, current_video_length)) +def get_default_value(choices, current_value, default_value = None): + for label, value in choices: + if value == current_value: + return current_value + return default_value + def generate_video_tab(update_form = False, state_dict = None, ui_defaults = None, model_family = None, model_choice = None, header = None, main = None, main_tabs= None): global inputs_names #, advanced @@ -7277,7 +7428,10 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non image_prompt_types_allowed = model_def.get("image_prompt_types_allowed", "") model_mode_choices = model_def.get("model_modes", None) - with gr.Column(visible= image_mode_value == 0 and (len(image_prompt_types_allowed)> 0 or model_mode_choices is not None)) as image_prompt_column: + model_modes_visibility = [0,1,2] + if model_mode_choices is not None: model_modes_visibility= model_mode_choices.get("image_modes", model_modes_visibility) + + with gr.Column(visible= image_mode_value == 0 and len(image_prompt_types_allowed)> 0 or model_mode_choices is not None and image_mode_value in model_modes_visibility ) as image_prompt_column: # Video Continue / Start Frame / End Frame image_prompt_type_value= ui_defaults.get("image_prompt_type","") image_prompt_type = gr.Text(value= image_prompt_type_value, visible= False) @@ -7293,7 +7447,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non if "L" in image_prompt_types_allowed: any_video_source = True image_prompt_type_choices += [("Continue Last Video", "L")] - with gr.Group(visible= len(image_prompt_types_allowed)>1) as image_prompt_type_group: + with gr.Group(visible= len(image_prompt_types_allowed)>1 and image_mode_value == 0) as image_prompt_type_group: with gr.Row(): image_prompt_type_radio_allowed_values= filter_letters(image_prompt_types_allowed, "SVL") image_prompt_type_radio_value = filter_letters(image_prompt_type_value, image_prompt_type_radio_allowed_values, image_prompt_type_choices[0][1] if len(image_prompt_type_choices) > 0 else "") @@ -7309,10 +7463,11 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non image_start_row, image_start, image_start_extra = get_image_gallery(label= "Images as starting points for new Videos in the Generation Queue", value = ui_defaults.get("image_start", None), visible= "S" in image_prompt_type_value ) video_source = gr.Video(label= "Video to Continue", height = gallery_height, visible= "V" in image_prompt_type_value, value= ui_defaults.get("video_source", None),) image_end_row, image_end, image_end_extra = get_image_gallery(label= get_image_end_label(ui_defaults.get("multi_prompts_gen_type", 0)), value = ui_defaults.get("image_end", None), visible= any_letters(image_prompt_type_value, "SVL") and ("E" in image_prompt_type_value) ) - if model_mode_choices is None: + if model_mode_choices is None or image_mode_value not in model_modes_visibility: model_mode = gr.Dropdown(value=None, visible=False) else: - model_mode = gr.Dropdown(choices=model_mode_choices["choices"], value=ui_defaults.get("model_mode", model_mode_choices["default"]), label=model_mode_choices["label"], visible=True) + model_mode_value = get_default_value(model_mode_choices["choices"], ui_defaults.get("model_mode", None), model_mode_choices["default"] ) + model_mode = gr.Dropdown(choices=model_mode_choices["choices"], value=model_mode_value, 
label=model_mode_choices["label"], visible=True) keep_frames_video_source = gr.Text(value=ui_defaults.get("keep_frames_video_source","") , visible= len(filter_letters(image_prompt_type_value, "VL"))>0 , scale = 2, label= "Truncate Video beyond this number of resampled Frames (empty=Keep All, negative truncates from End)" ) any_control_video = any_control_image = False @@ -7374,9 +7529,11 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non video_prompt_type_video_guide_alt_label = guide_custom_choices.get("label", "Control Video Process") if image_outputs: video_prompt_type_video_guide_alt_label = video_prompt_type_video_guide_alt_label.replace("Video", "Image") video_prompt_type_video_guide_alt_choices = [(label.replace("Video", "Image") if image_outputs else label, value) for label,value in guide_custom_choices["choices"] ] + guide_custom_choices_value = get_default_value(video_prompt_type_video_guide_alt_choices, filter_letters(video_prompt_type_value, guide_custom_choices["letters_filter"]), guide_custom_choices.get("default", "") ) video_prompt_type_video_guide_alt = gr.Dropdown( choices= video_prompt_type_video_guide_alt_choices, - value=filter_letters(video_prompt_type_value, guide_custom_choices["letters_filter"], guide_custom_choices.get("default", "") ), + # value=filter_letters(video_prompt_type_value, guide_custom_choices["letters_filter"], guide_custom_choices.get("default", "") ), + value=guide_custom_choices_value, visible = guide_custom_choices.get("visible", True), label= video_prompt_type_video_guide_alt_label, show_label= guide_custom_choices.get("show_label", True), scale = 2 ) @@ -7437,7 +7594,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non label=image_ref_choices.get("label", "Ref. 
Images Type"), show_label= True, scale = 2 ) - image_guide = gr.Image(label= "Control Image", height = 800, width=800, type ="pil", visible= image_mode_value==1 and "V" in video_prompt_type_value and ("U" in video_prompt_type_value or not "A" in video_prompt_type_value ) , value= ui_defaults.get("image_guide", None)) + image_guide = gr.Image(label= "Control Image", height = 800, type ="pil", visible= image_mode_value==1 and "V" in video_prompt_type_value and ("U" in video_prompt_type_value or not "A" in video_prompt_type_value ) , value= ui_defaults.get("image_guide", None)) video_guide = gr.Video(label= "Control Video", height = gallery_height, visible= (not image_outputs) and "V" in video_prompt_type_value, value= ui_defaults.get("video_guide", None)) if image_mode_value >= 1: image_guide_value = ui_defaults.get("image_guide", None) @@ -7457,7 +7614,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non layers=False, brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"), # fixed_canvas= True, - width=800, + # width=800, height=800, # transforms=None, # interactive=True, @@ -7472,12 +7629,12 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non denoising_strength = gr.Slider(0, 1, value= ui_defaults.get("denoising_strength" ,0.5), step=0.01, label=f"Denoising Strength (the Lower the Closer to the Control {'Image' if image_outputs else 'Video'})", visible = "G" in video_prompt_type_value, show_reset_button= False) keep_frames_video_guide_visible = not image_outputs and "V" in video_prompt_type_value and not model_def.get("keep_frames_video_guide_not_supported", False) keep_frames_video_guide = gr.Text(value=ui_defaults.get("keep_frames_video_guide","") , visible= keep_frames_video_guide_visible , scale = 2, label= "Frames to keep in Control Video (empty=All, 1=first, a:b for a range, space to separate values)" ) #, -1=last - - with gr.Column(visible= ("V" in video_prompt_type_value or "K" in video_prompt_type_value or "F" in video_prompt_type_value) and vace) as video_guide_outpainting_col: + video_guide_outpainting_modes = model_def.get("video_guide_outpainting", []) + with gr.Column(visible= ("V" in video_prompt_type_value or "K" in video_prompt_type_value or "F" in video_prompt_type_value) and image_mode_value in video_guide_outpainting_modes) as video_guide_outpainting_col: video_guide_outpainting_value = ui_defaults.get("video_guide_outpainting","#") video_guide_outpainting = gr.Text(value=video_guide_outpainting_value , visible= False) with gr.Group(): - video_guide_outpainting_checkbox = gr.Checkbox(label="Enable Spatial Outpainting on Control Video, Landscape or Injected Reference Frames", value=len(video_guide_outpainting_value)>0 and not video_guide_outpainting_value.startswith("#") ) + video_guide_outpainting_checkbox = gr.Checkbox(label="Enable Spatial Outpainting on Control Video, Landscape or Injected Reference Frames" if image_mode_value == 0 else "Enable Spatial Outpainting on Control Image", value=len(video_guide_outpainting_value)>0 and not video_guide_outpainting_value.startswith("#") ) with gr.Row(visible = not video_guide_outpainting_value.startswith("#")) as video_guide_outpainting_row: video_guide_outpainting_value = video_guide_outpainting_value[1:] if video_guide_outpainting_value.startswith("#") else video_guide_outpainting_value video_guide_outpainting_list = [0] * 4 if len(video_guide_outpainting_value) == 0 else [int(v) for v in video_guide_outpainting_value.split(" ")] @@ -7495,7 +7652,7 @@ def 
generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non image_refs_label = "Start Image" if hunyuan_video_avatar else ("Reference Image" if image_refs_single_image_mode else "Reference Images") + (" (each Image will start a new Clip)" if infinitetalk else "") image_refs_row, image_refs, image_refs_extra = get_image_gallery(label= image_refs_label, value = ui_defaults.get("image_refs", None), visible= "I" in video_prompt_type_value, single_image_mode=image_refs_single_image_mode) - frames_positions = gr.Text(value=ui_defaults.get("frames_positions","") , visible= "F" in video_prompt_type_value, scale = 2, label= "Positions of Injected Frames separated by Spaces (1=first, no position for Objects / People)" ) + frames_positions = gr.Text(value=ui_defaults.get("frames_positions","") , visible= "F" in video_prompt_type_value, scale = 2, label= "Positions of Injected Frames (1=first, L=last of a window, no position for other Image Refs)" ) image_refs_relative_size = gr.Slider(20, 100, value=ui_defaults.get("image_refs_relative_size", 50), step=1, label="Rescale Internaly Image Ref (% in relation to Output Video) to change Output Composition", visible = model_def.get("any_image_refs_relative_size", False) and image_outputs) no_background_removal = model_def.get("no_background_removal", False) or image_ref_choices is None @@ -7522,7 +7679,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non speaker_choices=[("None", "")] if any_single_speaker: speaker_choices += [("One Person Speaking Only", "A")] if any_multi_speakers:speaker_choices += [ - ("Two speakers, Auto Separation of Speakers (will work only if there is little background noise)", "XA"), + ("Two speakers, Auto Separation of Speakers (will work only if Voices are distinct)", "XA"), ("Two speakers, Speakers Audio sources are assumed to be played in a Row", "CAB"), ("Two speakers, Speakers Audio sources are assumed to be played in Parallel", "PAB") ] @@ -7537,6 +7694,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non with gr.Row(visible = any_audio_voices_support and not image_outputs) as audio_guide_row: audio_guide = gr.Audio(value= ui_defaults.get("audio_guide", None), type="filepath", label="Voice to follow", show_download_button= True, visible= any_audio_voices_support and "A" in audio_prompt_type_value ) audio_guide2 = gr.Audio(value= ui_defaults.get("audio_guide2", None), type="filepath", label="Voice to follow #2", show_download_button= True, visible= any_audio_voices_support and "B" in audio_prompt_type_value ) + remove_background_sound = gr.Checkbox(label="Video Motion ignores Background Music (for better Lip Sync)", value="V" in audio_prompt_type_value, visible = any_audio_voices_support and any_letters(audio_prompt_type_value, "ABX") and not image_outputs) with gr.Row(visible = any_audio_voices_support and ("B" in audio_prompt_type_value or "X" in audio_prompt_type_value) and not image_outputs ) as speakers_locations_row: speakers_locations = gr.Text( ui_defaults.get("speakers_locations", "0:45 55:100"), label="Speakers Locations separated by a Space.
Each Location = Left:Right or a BBox Left:Top:Right:Bottom", visible= True) @@ -7916,7 +8074,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non sliding_window_defaults = model_def.get("sliding_window_defaults", {}) sliding_window_size = gr.Slider(5, get_max_frames(257), value=ui_defaults.get("sliding_window_size", 129), step=4, label="Sliding Window Size") sliding_window_overlap = gr.Slider(sliding_window_defaults.get("overlap_min", 1), sliding_window_defaults.get("overlap_max", 97), value=ui_defaults.get("sliding_window_overlap",sliding_window_defaults.get("overlap_default", 5)), step=sliding_window_defaults.get("overlap_step", 4), label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)") - sliding_window_color_correction_strength = gr.Slider(0, 1, value=ui_defaults.get("sliding_window_color_correction_strength",1), step=0.01, label="Color Correction Strength (match colors of new window with previous one, 0 = disabled)", visible = True) + sliding_window_color_correction_strength = gr.Slider(0, 1, value=ui_defaults.get("sliding_window_color_correction_strength",0), step=0.01, label="Color Correction Strength (match colors of new window with previous one, 0 = disabled)", visible = True) sliding_window_overlap_noise = gr.Slider(0, 150, value=ui_defaults.get("sliding_window_overlap_noise",20 if vace else 0), step=1, label="Noise to be added to overlapped frames to reduce blur effect" , visible = vace) sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, label="Discard Last Frames of a Window (that may have bad quality)", visible = True) @@ -7926,7 +8084,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non ("Aligned to the beginning of the First Window of the new Video Sample", "T"), ], value=filter_letters(video_prompt_type_value, "T"), - label="Control Video / Control Audio temporal alignment when any Source Video", + label="Control Video / Injected Frames / Control Audio temporal alignment when continuing a Video", visible = vace or ltxv or t2v or infinitetalk ) @@ -8112,7 +8270,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non video_guide_outpainting_checkbox, video_guide_outpainting_row, show_advanced, video_info_to_control_video_btn, video_info_to_video_source_btn, sample_solver_row, video_buttons_row, image_buttons_row, video_postprocessing_tab, audio_remuxing_tab, PP_MMAudio_row, PP_custom_audio_row, video_info_to_start_image_btn, video_info_to_end_image_btn, video_info_to_reference_image_btn, video_info_to_image_guide_btn, video_info_to_image_mask_btn, - NAG_col, speakers_locations_row, embedded_guidance_row, guidance_phases_row, guidance_row, resolution_group, cfg_free_guidance_col, control_net_weights_row, guide_selection_row, image_mode_tabs, + NAG_col, remove_background_sound, speakers_locations_row, embedded_guidance_row, guidance_phases_row, guidance_row, resolution_group, cfg_free_guidance_col, control_net_weights_row, guide_selection_row, image_mode_tabs, min_frames_if_references_col, video_prompt_type_alignment, prompt_enhancer_btn, tab_inpaint, tab_t2v] + image_start_extra + image_end_extra + image_refs_extra # presets_column, if update_form: locals_dict = locals() @@ -8131,11 +8289,11 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non gr.on(triggers=[video_length.release, force_fps.change,
video_guide.change, video_source.change], fn=refresh_video_length_label, inputs=[state, video_length, force_fps, video_guide, video_source] , outputs = video_length, trigger_mode="always_last", show_progress="hidden" ) guidance_phases.change(fn=change_guidance_phases, inputs= [state, guidance_phases], outputs =[model_switch_phase, guidance_phases_row, switch_threshold, switch_threshold2, guidance2_scale, guidance3_scale ]) audio_prompt_type_remux.change(fn=refresh_audio_prompt_type_remux, inputs=[state, audio_prompt_type, audio_prompt_type_remux], outputs=[audio_prompt_type]) - audio_prompt_type_sources.change(fn=refresh_audio_prompt_type_sources, inputs=[state, audio_prompt_type, audio_prompt_type_sources], outputs=[audio_prompt_type, audio_guide, audio_guide2, speakers_locations_row]) + remove_background_sound.change(fn=refresh_remove_background_sound, inputs=[state, audio_prompt_type, remove_background_sound], outputs=[audio_prompt_type]) + audio_prompt_type_sources.change(fn=refresh_audio_prompt_type_sources, inputs=[state, audio_prompt_type, audio_prompt_type_sources], outputs=[audio_prompt_type, audio_guide, audio_guide2, speakers_locations_row, remove_background_sound]) image_prompt_type_radio.change(fn=refresh_image_prompt_type_radio, inputs=[state, image_prompt_type, image_prompt_type_radio], outputs=[image_prompt_type, image_start_row, image_end_row, video_source, keep_frames_video_source, image_prompt_type_endcheckbox], show_progress="hidden" ) image_prompt_type_endcheckbox.change(fn=refresh_image_prompt_type_endcheckbox, inputs=[state, image_prompt_type, image_prompt_type_radio, image_prompt_type_endcheckbox], outputs=[image_prompt_type, image_end_row] ) - # video_prompt_video_guide_trigger.change(fn=refresh_video_prompt_video_guide_trigger, inputs=[state, video_prompt_type, video_prompt_video_guide_trigger], outputs=[video_prompt_type, video_prompt_type_video_guide, video_guide, keep_frames_video_guide, denoising_strength, video_guide_outpainting_col, video_prompt_type_video_mask, video_mask, mask_expand]) - video_prompt_type_image_refs.input(fn=refresh_video_prompt_type_image_refs, inputs = [state, video_prompt_type, video_prompt_type_image_refs], outputs = [video_prompt_type, image_refs_row, remove_background_images_ref, image_refs_relative_size, frames_positions,video_guide_outpainting_col], show_progress="hidden") + video_prompt_type_image_refs.input(fn=refresh_video_prompt_type_image_refs, inputs = [state, video_prompt_type, video_prompt_type_image_refs,image_mode], outputs = [video_prompt_type, image_refs_row, remove_background_images_ref, image_refs_relative_size, frames_positions,video_guide_outpainting_col], show_progress="hidden") video_prompt_type_video_guide.input(fn=refresh_video_prompt_type_video_guide, inputs = [state, video_prompt_type, video_prompt_type_video_guide, image_mode, image_mask_guide, image_guide, image_mask], outputs = [video_prompt_type, video_guide, image_guide, keep_frames_video_guide, denoising_strength, video_guide_outpainting_col, video_prompt_type_video_mask, video_mask, image_mask, image_mask_guide, mask_expand], show_progress="hidden") video_prompt_type_video_guide_alt.input(fn=refresh_video_prompt_type_video_guide_alt, inputs = [state, video_prompt_type, video_prompt_type_video_guide_alt, image_mode], outputs = [video_prompt_type, video_guide, image_guide, image_refs_row, denoising_strength ], show_progress="hidden") video_prompt_type_video_mask.input(fn=refresh_video_prompt_type_video_mask, inputs = [state, video_prompt_type, 
video_prompt_type_video_mask, image_mode, image_mask_guide, image_guide, image_mask], outputs = [video_prompt_type, video_mask, image_mask_guide, image_guide, image_mask, mask_expand], show_progress="hidden") @@ -8338,7 +8496,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non outputs= None ).then(fn=process_prompt_and_add_tasks, inputs = [state, model_choice], - outputs= queue_df, + outputs= [queue_df, queue_accordion], show_progress="hidden", ).then(fn=prepare_generate_video, inputs= [state], @@ -8346,11 +8504,6 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non ).then(fn=activate_status, inputs= [state], outputs= [status_trigger], - ).then( - fn=lambda s: gr.Accordion(open=True) if len(get_gen_info(s).get("queue", [])) > 1 else gr.update(), - inputs=[state], - outputs=[queue_accordion], - show_progress="hidden", ).then(fn=process_tasks, inputs= [state], outputs= [preview_trigger, output_trigger], @@ -8468,12 +8621,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non outputs= None ).then(fn=process_prompt_and_add_tasks, inputs = [state, model_choice], - outputs=queue_df, - show_progress="hidden", - ).then( - fn=lambda s: gr.Accordion(open=True) if len(get_gen_info(s).get("queue", [])) > 1 else gr.update(), - inputs=[state], - outputs=[queue_accordion], + outputs=[queue_df, queue_accordion], show_progress="hidden", ).then( fn=update_status,