Merge branch 'main' into feature_add-cuda-docker-runner
3
.gitignore
vendored
@@ -14,7 +14,7 @@
*.pth
*.ckpt
*.safetensors
*.json
#*.json
# *.txt
*.backup
*.pkl
@@ -36,6 +36,7 @@ Wan2.1-T2V-1.3B/
Wan2.1-I2V-14B-480P/
Wan2.1-I2V-14B-720P/
outputs/
outputs2/
gradio_outputs/
ckpts/
loras/
53
LICENSE.txt
@@ -1,17 +1,46 @@
FREE for Non Commercial USE
WanGP NON-COMMERCIAL EVALUATION LICENSE 1.0

You are free to:
- Share — copy and redistribute the material in any medium or format
- Adapt — remix, transform, and build upon the material
The licensor cannot revoke these freedoms as long as you follow the license terms.
Definitions
1.1 “Software” means the source code, binaries, libraries, utilities and UI released under this license.
1.2 “Output” means images, videos or other media produced by running the Software.
1.3 “Commercial Use” means:
a) selling, sublicensing, renting, leasing, or otherwise distributing the Software, in whole or in part, for a fee or other consideration; or
b) offering the Software (or any derivative) as part of a paid product or hosted service; or
c) using the Software (or any derivative) to provide cloud-based or backend services, where end users access or pay for those services.

Under the following terms:
- Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
- NonCommercial — You may not use the material for commercial purposes.
License Grant
Subject to Section 3:
a) You are granted a worldwide, non-exclusive, royalty-free, revocable license to use, reproduce, modify and distribute the Software for non-commercial purposes only.
b) You are granted a worldwide, non-exclusive, royalty-free, irrevocable license to use, reproduce, modify and distribute the Output for any purpose, including commercial sale, provided that any commercial distribution of the Output includes a clear notice that the Output was produced (in whole or in part) using WanGP, along with a hyperlink to the WanGP application’s About tab or repository.

- No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
Notices:
Restrictions
3.1 You MAY NOT distribute, sublicense or otherwise make available the Software (or any derivative) for Commercial Use.
3.2 You MAY sell, license or otherwise commercially exploit the Output without restriction.
3.3 If you wish to use the Software for Commercial Use, you must obtain a separate commercial license from the Licensor.

- You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
Third-Party Components
4.1 The Software includes components licensed under various open-source licenses (e.g., Apache 2.0, MIT, BSD).
4.2 You must comply with all applicable terms of those third-party licenses, including preservation of copyright notices, inclusion of required license texts, and patent-grant provisions.
4.3 You can find the full text of each third-party license via the “About” tab in the WanGP application, which provides links to their original GitHub repositories.

Attribution
5.1 You must give appropriate credit by including:
• a copy of this license (or a link to it), and
• a notice that your use is based on “WanGP”.
5.2 You may do so in any reasonable manner, but not in any way that suggests the Licensor endorses you or your use.

Disclaimer of Warranty & Liability
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE.

Commercial Licensing
The Licensor may offer commercial licenses for the Software, which grant rights to use the Software for Commercial Use. Please contact [deepbeepmeep@yahoo.com] for terms and pricing.

Effective Date & Previous Versions
8.1 This license is effective as of the date the LICENSE file is updated in the WanGP repository.
8.2 Any copies of the Software obtained under prior license terms before this Effective Date remain governed by those prior terms; such granted rights are irrevocable.
8.3 Use of the Software after the release of any subsequent version by the Licensor is subject to the terms of the then-current license, unless a separate agreement is in place.

Acceptable Use / Moral Clause
9.1 You MAY NOT use the Software or the Output to facilitate or produce content that is illegal, harmful, violent, harassing, defamatory, fraudulent, or otherwise violates applicable laws or fundamental human rights.
9.2 You MAY NOT deploy the Software or Output in contexts that promote hate speech, extremist ideology, human rights abuses, or other actions that could foreseeably cause significant harm to individuals or groups.
9.3 The Licensor reserves the right to terminate the rights granted under this license if a licensee materially breaches this Acceptable Use clause.

END OF LICENSE

No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.
239
README.md
@@ -19,7 +19,157 @@ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models

**Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep

## 🔥 Latest Updates
## 🔥 Latest Updates :

### August 29 2025: WanGP v8.2 - Here Goes Your Weekend

- **InfiniteTalk Video to Video**: this feature can be used for Video Dubbing. Keep in mind that it is a *Sparse Video to Video*, that is, internally only one image is used per Sliding Window. However, thanks to the new *Smooth Transition* mode, each new clip is connected to the previous one and all the camera work is done by InfiniteTalk. If you don't get any transition, increase the number of frames of a Sliding Window (81 frames recommended).

- **StandIn**: a very light model specialized in Identity Transfer. I have provided two versions of StandIn: a basic one derived from the text 2 video model and another based on Vace. If used with Vace, the last reference frame given to Vace will also be used for StandIn.

- **Flux USO**: a new Flux derived *Image Editing tool*, but this one is specialized both in *Identity Transfer* and *Style Transfer*. Style has to be understood in its wide meaning: give a reference picture of a person and another one of sushis and you will turn this person into sushis.

### August 24 2025: WanGP v8.1 - the RAM Liberator

- **Reserved RAM entirely freed when switching models**: you should get far fewer RAM-related out of memory errors. I have also added a button in *Configuration / Performance* that will release most of the RAM used by WanGP if you want to use another application without quitting WanGP.
- **InfiniteTalk** support: an improved version of Multitalk that supposedly supports very long video generations driven by an audio track. It exists in two flavors (*Single Speaker* and *Multi Speakers*) but doesn't seem to be compatible with Vace. One key new feature compared to Multitalk is that you can have different visual shots associated with the same audio: each Reference frame you provide will be associated with a new Sliding Window. If only one Reference frame is provided, it will be used for all windows. When Continuing a video, you can either continue the current shot (no Reference Frame) or add new shots (one or more Reference Frames).\
If you are not into audio, you can still use this model to generate infinitely long image2video; just select "no speaker". Last but not least, InfiniteTalk works with all the Lora accelerators.
- **Flux Chroma 1 HD** support: an uncensored Flux based model that is lighter than Flux (8.9B versus 12B) and can fit entirely in VRAM with only 16 GB of VRAM. Unfortunately it is not distilled, so you will need CFG and at least 20 steps.

### August 21 2025: WanGP v8.01 - the killer of seven

- **Qwen Image Edit**: a Flux Kontext challenger (prompt driven image editing). Best results (including Identity preservation) will be obtained at 720p. Beyond that you may get image outpainting and / or lose identity preservation; below 720p prompt adherence will be worse. Qwen Image Edit works with the Qwen Lightning 4 steps Lora. I have also unlocked all the resolutions for Qwen models. Bonus Zone: support for multiple image compositions, but identity preservation won't be as good.
- **On demand Prompt Enhancer** (needs to be enabled in the Configuration Tab) that you can use to Enhance a Text Prompt before starting a Generation. You can refine the Enhanced Prompt or change the original Prompt.
- Choice of a **Non censored Prompt Enhancer**. Beware, this one is VRAM hungry and will require 12 GB of VRAM to work.
- **Memory Profile customizable per model**: useful to set, for instance, Profile 3 (preload the model entirely in VRAM) only for Image Generation models if you have 24 GB of VRAM. In that case Generation will be much faster, because with Image generators (contrary to Video generators) a lot of time is otherwise wasted in offloading.
- **Expert Guidance Mode**: change the Guidance during the generation up to 2 times. Very useful with Wan 2.2 Lightning to reduce the slow motion effect. The idea is to insert a CFG phase before the 2 accelerated phases that follow and have no Guidance. I have added the finetune *Wan2.2 Vace Lightning 3 Phases 14B* with a prebuilt configuration (see the sketch after this list). Please note that it is an 8 steps process although the Lightning Lora is 4 steps. This expert guidance mode is also available with Wan 2.1.
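
For reference, here is a minimal sketch of how guidance phases are declared in the model definition files added later in this diff. The keys (`guidance_phases`, `switch_threshold`, `guidance_scale`, `guidance2_scale`, `flow_shift`, `URLs`, `URLs2`, `group`) and values are taken from `defaults/t2v_2_2.json` below; the name is a placeholder, and the 3-phase Lightning finetune presumably adds a further phase on top of this two-phase layout:

```json
{
  "model": {
    "name": "Wan2.2 Text2video 14B (two-phase example)",
    "architecture": "t2v",
    "URLs": "t2v_2_2",
    "URLs2": "t2v_2_2",
    "group": "wan2_2"
  },
  "guidance_phases": 2,
  "switch_threshold": 875,
  "guidance_scale": 4,
  "guidance2_scale": 3,
  "flow_shift": 12
}
```

Presumably the phase switch happens when the denoising schedule crosses `switch_threshold`, with `guidance_scale` applied before the switch and `guidance2_scale` after; this reading is inferred from the defaults files in this PR, not stated explicitly.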

*WanGP 8.01 update: improved Qwen Image Edit Identity Preservation*

### August 12 2025: WanGP v7.7777 - Lucky Day(s)

This is your lucky day! Thanks to new configuration options that let you store generated Videos and Images in lossless compressed formats, you will find that they look two times better without doing anything!

Just kidding, they will only be marginally better, but at least this opens the way to professional editing.

Support:
- Video: x264, x264 lossless, x265
- Images: jpeg, png, webp, webp lossless

Generation Settings are stored in each of the above regardless of the format (that was the hard part).

Also, you can now choose different output directories for images and videos.

Unexpected luck: fixed Lightning 8 steps for Qwen and Lightning 4 steps for Wan 2.2; now you just need a 1x multiplier, no weird numbers.
*update 7.777: oops, got a crash with FastWan? Luck comes and goes, try a new update, maybe you will have a better chance this time*
*update 7.7777: Sometimes good luck seems to last forever. For instance, what if Qwen Lightning 4 steps could also work with WanGP?*
- https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-4steps-V1.0-bf16.safetensors (Qwen Lightning 4 steps)
- https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-8steps-V1.1-bf16.safetensors (new improved version of Qwen Lightning 8 steps)

### August 10 2025: WanGP v7.76 - Faster than the VAE ...
We have a funny one here today: FastWan 2.2 5B, the Fastest Video Generator, only 20s to generate 121 frames at 720p. The snag is that the VAE is twice as slow...
Thanks to Kijai for extracting the Lora that is used to build the corresponding finetune.

*WanGP 7.76: fixed the mess I made with the i2v models (the Loras path was wrong for Wan 2.2 and Clip was broken)*

### August 9 2025: WanGP v7.74 - Qwen Rebirth part 2
Added support for the Qwen Lightning Lora for an 8 steps generation (https://huggingface.co/lightx2v/Qwen-Image-Lightning/blob/main/Qwen-Image-Lightning-8steps-V1.0.safetensors). The Lora is not normalized and you can use a multiplier around 0.1.

Mag Cache support for all the Wan 2.2 models. Don't forget to set guidance to 1 and use 8 denoising steps; your gen will be 7x faster!

### August 8 2025: WanGP v7.73 - Qwen Rebirth
Ever wondered what impact not using Guidance has on a model that expects it? Just look at Qwen Image in WanGP 7.71, whose outputs were erratic. Somehow I had convinced myself that Qwen was a distilled model. In fact Qwen was dying for a negative prompt. And in WanGP 7.72 there is at last one for him.

As Qwen is not so picky after all, I have also added a quantized text encoder, which reduces the RAM requirements of Qwen by 10 GB (the quantized text encoder produced garbage before).

Unfortunately the Sage bug is still there for older GPU architectures. Added an Sdpa fallback for these architectures.

*7.73 update: still a Sage / Sage2 bug for GPUs before the RTX 40xx. I have added a detection mechanism that forces Sdpa attention if that's the case*


### August 6 2025: WanGP v7.71 - Picky, picky

This release comes with two new models:
- Qwen Image: a Commercial grade Image generator capable of injecting full sentences into the generated Image while still offering incredible visuals
- Wan 2.2 TextImage to Video 5B: the last Wan 2.2 needed if you want to complete your Wan 2.2 collection (Loras for this model can be stored in "\loras\5B")

There is a catch though: they are very picky if you want to get good generations. First, they both need lots of steps (50?) to show what they have to offer. Then, for Qwen Image I had to hardcode the supported resolutions, because if you try anything else you will get garbage. Likewise Wan 2.2 5B will remind you of Wan 1.0 if you don't ask for at least 720p.

*7.71 update: Added VAE Tiling for both Qwen Image and Wan 2.2 TextImage to Video 5B, for low VRAM usage during a whole gen.*


### August 4 2025: WanGP v7.6 - Remuxed

With this new version you won't have any excuse if there is no sound in your video.

*Continue Video* now works with any video that already has some sound (hint: Multitalk).

Also, on top of MMaudio and the various sound driven models, I have added the ability to use your own soundtrack.

As a result you can apply a different sound source to each new video segment when doing a *Continue Video*.

For instance:
- first video part: use Multitalk with two people speaking
- second video part: apply your own soundtrack, which will gently follow the Multitalk conversation
- third video part: use a Vace effect and its corresponding control audio will be concatenated to the rest of the audio

To multiply the combinations I have also implemented *Continue Video* with the various image2video models.

Also:
- End Frame support added for LTX Video models
- Loras can now be targeted specifically at the High noise or Low noise models with Wan 2.2, check the Loras and Finetune guides
- Flux Krea Dev support

### July 30 2025: WanGP v7.5: Just another release ... Wan 2.2 part 2
Here is now Wan 2.2 image2video, a very good model if you want to set Start and End frames. Two Wan 2.2 models delivered, only one to go ...

Please note that although it is an image2video model, it is structurally very close to Wan 2.2 text2video (same layers with only a different initial projection). Given that Wan 2.1 image2video Loras don't work too well (half of their tensors are not supported), I have decided that this model will look for its Loras in the text2video Loras folder instead of the image2video folder.

I have also optimized RAM management with Wan 2.2 so that Loras and modules will be loaded only once in RAM and Reserved RAM; this saves up to 5 GB of RAM, which can make a difference...

And this time I really removed Vace Cocktail Light, which produced blurry output.

### July 29 2025: WanGP v7.4: Just another release ... Wan 2.2 Preview
Wan 2.2 is here. The good news is that WanGP won't require a single byte of extra VRAM to run it and it will be as fast as Wan 2.1. The bad news is that you will need much more RAM if you want to leverage this new model fully, since it has twice as many parameters.

So here is a preview version of Wan 2.2, that is, without the 5B model and Wan 2.2 image to video for the moment.

However, as I felt bad about delivering only half of the wares, I gave you instead **Wan 2.2 Vace Experimental Cocktail**!

A very good surprise indeed: the Loras and Vace partially work with Wan 2.2. We will need to wait for the official Vace 2.2 release since some Vace features are broken, like identity preservation.

Bonus zone: Flux multi image conditions have been added, or maybe not if I broke everything as I have been distracted by Wan...

7.4 update: I forgot to update the version number. I also removed Vace Cocktail Light, which didn't work well.

### July 27 2025: WanGP v7.3 : Interlude
While waiting for Wan 2.2, you will appreciate the model selection hierarchy, which is very useful to collect even more models. You will also appreciate that WanGP remembers which model you used last in each model family.

### July 26 2025: WanGP v7.2 : Ode to Vace
I am really convinced that Vace can do everything the other models can do, and in a better way, especially as Vace can be combined with Multitalk.

Here are some new Vace improvements:
- I have provided a default finetune named *Vace Cocktail*, which is a model created on the fly using the Wan text 2 video model and the Loras used to build FusioniX. The weight of the *Detail Enhancer* Lora has been reduced to improve identity preservation. Copy the model definition in *defaults/vace_14B_cocktail.json* into the *finetunes/* folder to change the Cocktail composition. Cocktail already contains some Lora accelerators, so there is no need to add an AccVid, CausVid or FusioniX Lora on top. The whole point of Cocktail is to be able to build your own FusioniX (which originally is a combination of 4 Loras) but without the inconveniences of FusioniX.
- Talking about identity preservation, it tends to go away when one generates a single Frame instead of a Video, which is a shame for our Vace photoshop. But there is a solution: I have added an Advanced Quality option that tells WanGP to generate a little more than one frame (it will still keep only the first frame). It will be a little slower, but you will be amazed how Vace Cocktail combined with this option preserves identities (bye bye *Phantom*).
- As in practice I have observed that one switches frequently between *Vace text2video* and *Vace text2image*, I have put them in the same place: they are now just one tab away, no need to reload the model. Likewise *Wan text2video* and *Wan text2image* have been merged.
- Color fixing when using Sliding Windows. A new postprocessing step, *Color Correction*, applied automatically by default (you can disable it in the *Advanced tab Sliding Window*), will try to match the colors of the new window with those of the previous window. It doesn't fix all the unwanted artifacts of the new window, but at least this makes the transition smoother. Thanks to the Multitalk team for the original code.

Also, you will enjoy our new real time statistics (CPU / GPU usage, RAM / VRAM used, ...). Many thanks to **Redtash1** for providing the framework for this new feature! You need to go to the Config tab to enable real time stats.


### July 21 2025: WanGP v7.12
- Flux Family Reunion: *Flux Dev* and *Flux Schnell* have been invited aboard WanGP. To celebrate that, Lora support for the Flux *diffusers* format has also been added.

- LTX Video upgraded to version 0.9.8: you can now generate 1800 frames (1 min of video!) in one go without a sliding window. With the distilled model it will take only 5 minutes with an RTX 4090 (you will need 22 GB of VRAM though). I have added options to select higher numbers of frames if you want to experiment (go to Configuration Tab / General / Increase the Max Number of Frames, change the value and restart the App).

- LTX Video ControlNet: a Control Net that allows you, for instance, to transfer Human motion or Depth from a control video. It is not as powerful as Vace but can produce interesting things, especially as you can now quickly generate a 1 min video. Under the hood, IC-Loras (see below) for Pose, Depth and Canny are automatically loaded for you, no need to add them.

- LTX IC-Lora support: these are special Loras that consume a conditional image or video.
Besides the Pose, Depth and Canny IC-Loras that are transparently loaded, there is the *detailer* (https://huggingface.co/Lightricks/LTX-Video-ICLoRA-detailer-13b-0.9.8), which is basically an upsampler. Add the *detailer* as a Lora and use LTX Raw Format as the control net choice to use it.

- Matanyone is now also for the GPU Poor, as its VRAM requirements have been divided by 2! (7.12 shadow update)

- Easier way to select video resolution


### July 15 2025: WanGP v7.0 is an AI Powered Photoshop
This release turns the Wan models into Image Generators. This goes way beyond just allowing you to generate a video made of a single frame:
- Multiple Images generated at the same time so that you can choose the one you like best. It is highly VRAM optimized so that you can generate for instance 4 720p Images at the same time with less than 10 GB
@@ -86,84 +236,6 @@ Taking care of your life is not enough, you want new stuff to play with ?

**If you had upgraded to v6.5 please upgrade again to 6.5.1 as this will fix a bug that ignored Loras beyond the first one**

### June 23 2025: WanGP v6.3, Vace Unleashed. Thought we couldn't squeeze Vace even more ?
- Multithreaded preprocessing when possible for faster generations
- Multithreaded frames Lanczos Upsampling as a bonus
- A new Vace preprocessor: *Flow* to extract fluid motion
- Multi Vace Controlnets: you can now transfer several properties at the same time. This opens new possibilities to explore; for instance, if you transfer *Human Movement* and *Shapes* at the same time, for some reason the lighting of your character will take the environment much more into account.
- Injected Frames Outpainting, in case you missed it in WanGP 6.21

Don't know how to use all of the Vace features? Check the Vace Guide embedded in WanGP, as it has also been updated.


### June 19 2025: WanGP v6.2, Vace even more Powercharged
👋 Have I told you that I am a big fan of Vace? Here are more goodies to unleash its power:
- If you ever wanted to watch Star Wars in 4:3, just use the new *Outpainting* feature and it will add the missing bits of image at the top and the bottom of the screen. The best thing is *Outpainting* can be combined with all the other Vace modifications; for instance, you can change the main character of your favorite movie at the same time
- More processing can be combined at the same time (for instance the depth process can be applied outside the mask)
- Upgraded the depth extractor to Depth Anything 2, which is much more detailed

As a bonus, I have added two finetunes based on the Self-Forcing technology (which requires only 4 steps to generate a video): Wan 2.1 text2video Self-Forcing and Vace Self-Forcing. I know there is a Lora around, but the quality of the Lora is worse (at least with Vace) compared to the full model. Don't hesitate to share your opinion about this on the discord server.
### June 17 2025: WanGP v6.1, Vace Powercharged
👋 Lots of improvements for Vace, the Mother of all Models:
- masks can now be combined with on the fly processing of a control video, for instance you can extract the motion of a specific person defined by a mask
- on the fly modification of masks: reversed masks (with the same mask you can modify the background instead of the people covered by the masks), enlarged masks (you can cover more area if, for instance, the person you are trying to inject is larger than the one in the mask), ...
- view these modified masks directly inside WanGP during the video generation to check they are really as expected
- multiple frame injections: multiple frames can be injected at any location of the video
- expand past videos in one click: just select one generated video to expand it

Of course all this new stuff works on all Vace finetunes (including Vace FusioniX).

Thanks also to Reevoy24 for adding a Notification sound at the end of a generation and for fixing the background color of the current generation summary.

### June 12 2025: WanGP v6.0
👋 *Finetune models*: You find the 20 models supported by WanGP not sufficient? Too impatient to wait for the next release to get the support for a newly released model? Your prayers have been answered: if a new model is compatible with a model architecture supported by WanGP, you can add support for this model yourself by just creating a finetune model definition (a minimal sketch is shown below). You can then store this model in the cloud (for instance on Huggingface) and the very light finetune definition file can be easily shared with other users. WanGP will automatically download the finetuned model for them.
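
As a rough illustration only (the authoritative format is described in the Finetune Guide), a finetune definition is a small JSON file shaped like the `defaults/*.json` files added later in this diff; every name, description and URL below is a placeholder:

```json
{
  "model": {
    "name": "My Custom Finetune 14B",
    "architecture": "t2v",
    "description": "Hypothetical finetune definition; replace the URL with your own checkpoint hosted in the cloud.",
    "URLs": [
      "https://huggingface.co/your-account/your-model/resolve/main/your_finetune_bf16.safetensors"
    ]
  },
  "prompt": "draw a hat",
  "resolution": "1280x720",
  "num_inference_steps": 30
}
```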

To celebrate the new finetunes support, here are a few finetune gifts (directly accessible from the model selection menu):
- *Fast Hunyuan Video*: t2v generation in only 6 steps
- *Hunyuan Video AccVideo*: t2v generation in only 5 steps
- *Wan FusioniX*: a combo of AccVideo / CausVid and other models that can generate high quality Wan videos in only 8 steps

One more thing...

The new finetune system can be used to combine complementary models: what happens when you combine FusioniX Text2Video and Vace Control Net?

You get **Vace FusioniX**: the Ultimate Vace Model, Fast (10 steps, no need for guidance) and with a much better quality Video than the original slower model (despite it being the best Control Net out there). Here goes one more finetune...

Check the *Finetune Guide* to create finetune model definitions and share them on the WanGP discord server.

### June 11 2025: WanGP v5.5
👋 *Hunyuan Video Custom Audio*: it is similar to Hunyuan Video Avatar except there isn't any lower limit on the number of frames and you can use your reference images in a different context than the image itself\
*Hunyuan Video Custom Edit*: Hunyuan Video Controlnet, use it to do inpainting and replace a person in a video while still keeping their poses. Similar to Vace but less restricted than the Wan models in terms of content...


### June 6 2025: WanGP v5.41
👋 Bonus release: Support for the **AccVideo** Lora to speed up Video generation by 2x in Wan models. Check the Loras documentation to get the usage instructions of AccVideo.\
You will need to do a *pip install -r requirements.txt*

### June 6 2025: WanGP v5.4
👋 World Exclusive: **Hunyuan Video Avatar** Support! You won't need 80 GB of VRAM nor 32 GB of VRAM, just 10 GB of VRAM will be sufficient to generate up to 15s of high quality speech / song driven Video at a high speed with no quality degradation. Support for TeaCache included.\
Here is a link to the original repo where you will find some very interesting documentation and examples: https://github.com/Tencent-Hunyuan/HunyuanVideo-Avatar. Kudos to the Hunyuan Video Avatar team for the best model of its kind.\
Also many thanks to Reevoy24 for repackaging / completing the documentation

### May 28 2025: WanGP v5.31
👋 Added **Phantom 14B**, a model that you can use to transfer objects / people into the video. My preference still goes to Vace, which remains the king of controlnets.
VACE improvements: Better sliding window transitions, image mask support in Matanyone, new Extend Video feature, and enhanced background removal options.

### May 26, 2025: WanGP v5.3
👋 Settings management revolution! Now you can:
- Select any generated video and click *Use Selected Video Settings* to instantly reuse its configuration
- Drag & drop videos to automatically extract their settings metadata
- Export/import settings as JSON files for easy sharing and backup (a rough sketch of such a file follows this list)
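
The exact schema of an exported settings file is not shown in this PR; judging from the generation keys used in the `defaults/*.json` files elsewhere in this diff, an exported settings file plausibly looks something like the following hypothetical sketch (the `seed` field is an assumption not taken from this diff):

```json
{
  "prompt": "draw a hat",
  "resolution": "1280x720",
  "video_length": 81,
  "num_inference_steps": 30,
  "guidance_scale": 5,
  "flow_shift": 5,
  "seed": 42
}
```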

### May 20, 2025: WanGP v5.2
👋 **CausVid support** - Generate videos in just 4-12 steps with the new distilled Wan model! Also added experimental MoviiGen for 1080p generation (20GB+ VRAM required). Check the Loras documentation to get the usage instructions of CausVid.

### May 18, 2025: WanGP v5.1
👋 **LTX Video 13B Distilled** - Generate high-quality videos in less than one minute!

### May 17, 2025: WanGP v5.0
👋 **One App to Rule Them All!** Added Hunyuan Video and LTX Video support, plus Vace 14B and integrated prompt enhancer.

See full changelog: **[Changelog](docs/CHANGELOG.md)**

## 📋 Table of Contents
@@ -211,7 +283,7 @@ git clone https://github.com/deepbeepmeep/Wan2GP.git
cd Wan2GP
conda create -n wan2gp python=3.10.9
conda activate wan2gp
pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
pip install -r requirements.txt
```

@@ -229,6 +301,7 @@ git pull
pip install -r requirements.txt
```


## 📦 Installation

For detailed installation instructions for different GPU generations:
@@ -251,6 +324,12 @@ For detailed installation instructions for different GPU generations:
- **[Changelog](docs/CHANGELOG.md)** - Latest updates and version history
- **[Troubleshooting](docs/TROUBLESHOOTING.md)** - Common issues and solutions

## 📚 Video Guides
- Nice Video that explains how to use Vace:\
https://www.youtube.com/watch?v=FMo9oN2EAvE
- Another Vace guide:\
https://www.youtube.com/watch?v=T5jNiEhf9xk

## 🔗 Related Projects

### Other Models for the GPU Poor

[Binary image changes in this merge: assets/logo.png (before: 55 KiB) and seven other images with before sizes of 1.7 MiB, 516 KiB, 871 KiB, 294 KiB, 1.5 MiB, 628 KiB and 208 KiB.]
14
configs/i2v_2_2.json
Normal file
@@ -0,0 +1,14 @@
{
"_class_name": "WanModel",
"_diffusers_version": "0.33.0",
"dim": 5120,
"eps": 1e-06,
"ffn_dim": 13824,
"freq_dim": 256,
"in_dim": 36,
"model_type": "i2v2_2",
"num_heads": 40,
"num_layers": 40,
"out_dim": 16,
"text_len": 512
}
15
configs/i2v_2_2_multitalk.json
Normal file
@@ -0,0 +1,15 @@
{
"_class_name": "WanModel",
"_diffusers_version": "0.33.0",
"dim": 5120,
"eps": 1e-06,
"ffn_dim": 13824,
"freq_dim": 256,
"in_dim": 36,
"model_type": "i2v2_2",
"num_heads": 40,
"num_layers": 40,
"out_dim": 16,
"text_len": 512,
"multitalk_output_dim": 768
}
@@ -10,5 +10,6 @@
"num_heads": 40,
"num_layers": 40,
"out_dim": 16,
"text_len": 512
"text_len": 512,
"multitalk_output_dim": 768
}
18
configs/qwen_image_20B.json
Normal file
@@ -0,0 +1,18 @@
{
"_class_name": "QwenImageTransformer2DModel",
"_diffusers_version": "0.34.0.dev0",
"attention_head_dim": 128,
"axes_dims_rope": [
16,
56,
56
],
"guidance_embeds": false,
"in_channels": 64,
"joint_attention_dim": 3584,
"num_attention_heads": 24,
"num_layers": 60,
"out_channels": 16,
"patch_size": 2,
"pooled_projection_dim": 768
}
15
configs/standin.json
Normal file
@@ -0,0 +1,15 @@
{
"_class_name": "WanModel",
"_diffusers_version": "0.30.0",
"dim": 5120,
"eps": 1e-06,
"ffn_dim": 13824,
"freq_dim": 256,
"in_dim": 16,
"model_type": "t2v",
"num_heads": 40,
"num_layers": 40,
"out_dim": 16,
"text_len": 512,
"standin": true
}
14
configs/ti2v_2_2.json
Normal file
@@ -0,0 +1,14 @@
{
"_class_name": "WanModel",
"_diffusers_version": "0.33.0",
"dim": 3072,
"eps": 1e-06,
"ffn_dim": 14336,
"freq_dim": 256,
"in_dim": 48,
"model_type": "ti2v2_2",
"num_heads": 24,
"num_layers": 30,
"out_dim": 48,
"text_len": 512
}
17
configs/vace_standin_14B.json
Normal file
@@ -0,0 +1,17 @@
{
"_class_name": "VaceWanModel",
"_diffusers_version": "0.30.0",
"dim": 5120,
"eps": 1e-06,
"ffn_dim": 13824,
"freq_dim": 256,
"in_dim": 16,
"model_type": "t2v",
"num_heads": 40,
"num_layers": 40,
"out_dim": 16,
"text_len": 512,
"vace_layers": [0, 5, 10, 15, 20, 25, 30, 35],
"vace_in_dim": 96,
"standin": true
}
@@ -3,10 +3,9 @@
{
"name": "Fantasy Talking 720p",
"architecture" : "fantasy",
"modules": ["fantasy"],
"modules": [ ["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_fantasy_speaking_14B_bf16.safetensors"]],
"description": "The Fantasy Talking model corresponds to the original Wan image 2 video model combined with the Fantasy Speaking module to process an audio Input.",
"URLs": "i2v_720p",
"teacache_coefficients" : [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
"URLs": "i2v_720p"
},
"resolution": "1280x720"
}
16
defaults/flux.json
Normal file
@@ -0,0 +1,16 @@
{
"model": {
"name": "Flux 1 Dev 12B",
"architecture": "flux",
"description": "FLUX.1 Dev is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev_quanto_bf16_int8.safetensors"
],
"image_outputs": true,
"flux-model": "flux-dev"
},
"prompt": "draw a hat",
"resolution": "1280x720",
"batch_size": 1
}
18
defaults/flux_chroma.json
Normal file
@@ -0,0 +1,18 @@
{
"model": {
"name": "Flux 1 Chroma 1 HD 8.9B",
"architecture": "flux",
"description": "FLUX.1 Chroma is an 8.9 billion parameter model. As a base model, Chroma1 is intentionally designed to be an excellent starting point for finetuning. It provides a strong, neutral foundation for developers, researchers, and artists to create specialized models.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-chroma_hd_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-chroma_hd_quanto_bf16_int8.safetensors"
],
"image_outputs": true,
"flux-model": "flux-chroma"
},
"prompt": "draw a hat",
"resolution": "1280x720",
"guidance_scale": 3.0,
"num_inference_steps": 20,
"batch_size": 1
}
@@ -1,16 +1,19 @@
{
"model": {
"name": "Flux Dev Kontext 12B",
"architecture": "flux_dev_kontext",
"description": "FLUX.1 Kontext is a 12 billion parameter rectified flow transformer capable of editing images based on instructions stored in the Prompt. Please be aware that Flux Kontext is picky on the resolution of the input image the output dimensions may not match the dimensions of the input image.",
"name": "Flux 1 Dev Kontext 12B",
"architecture": "flux",
"description": "FLUX.1 Kontext is a 12 billion parameter rectified flow transformer capable of editing images based on instructions stored in the Prompt. Please be aware that Flux Kontext is picky on the resolution of the input image and the output dimensions may not match the dimensions of the input image.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1_kontext_dev_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1_kontext_dev_quanto_bf16_int8.safetensors"
]
],
"image_outputs": true,
"reference_image": true,
"flux-model": "flux-dev-kontext"
},
"prompt": "add a hat",
"resolution": "1280x720",
"video_length": 1
"batch_size": 1
}
19
defaults/flux_dev_uso.json
Normal file
@@ -0,0 +1,19 @@
{
"model": {
"name": "Flux 1 Dev USO 12B",
"architecture": "flux",
"description": "FLUX.1 Dev USO is a model specialized in Editing Images, with a particular focus on Style Transfers (up to two).",
"modules": [ ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_projector_bf16.safetensors"]],
"URLs": "flux",
"loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_dit_lora_bf16.safetensors"],
"image_outputs": true,
"reference_image": true,
"flux-model": "flux-dev-uso"
},
"prompt": "add a hat",
"embedded_guidance_scale": 4,
"resolution": "1024x1024",
"batch_size": 1
}
16
defaults/flux_krea.json
Normal file
@@ -0,0 +1,16 @@
{
"model": {
"name": "Flux 1 Krea Dev 12B",
"architecture": "flux",
"description": "Cutting-edge output quality, with a focus on aesthetic photography.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-krea-dev_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-krea-dev_quanto_bf16_int8.safetensors"
],
"image_outputs": true,
"flux-model": "flux-dev"
},
"prompt": "draw a hat",
"resolution": "1280x720",
"batch_size": 1
}
17
defaults/flux_schnell.json
Normal file
@@ -0,0 +1,17 @@
{
"model": {
"name": "Flux 1 Schnell 12B",
"architecture": "flux",
"description": "FLUX.1 Schnell is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. As a distilled model it requires fewer denoising steps.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-schnell_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-schnell_quanto_bf16_int8.safetensors"
],
"image_outputs": true,
"flux-model": "flux-schnell"
},
"prompt": "draw a hat",
"resolution": "1280x720",
"num_inference_steps": 10,
"batch_size": 1
}
@@ -1,11 +1,11 @@
{
"model":
{
"name": "Hunyuan Video text2video 720p 13B",
"name": "Hunyuan Video Text2video 720p 13B",
"architecture" : "hunyuan",
"description": "Probably the best text 2 video model available.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_720_bf16.safetensors.safetensors",
"https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_720_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_720_quanto_int8.safetensors"
]
}
@@ -1,7 +1,7 @@
{
"model":
{
"name": "Hunyuan Video image2video 720p 13B",
"name": "Hunyuan Video Image2video 720p 13B",
"architecture" : "hunyuan_i2v",
"description": "A good looking image 2 video model, but not so good in prompt adherence.",
"URLs": [
@@ -1,6 +1,6 @@
{
"model": {
"name": "Hunyuan AccVideo 720p 13B",
"name": "Hunyuan Video AccVideo 720p 13B",
"architecture": "hunyuan",
"description": "AccVideo is a novel efficient distillation method to accelerate video diffusion models with a synthetic dataset. Our method is 8.5x faster than HunyuanVideo.",
"URLs": [
@@ -1,6 +1,6 @@
{
"model": {
"name": "Hunyuan Fast Video 720p 13B",
"name": "Hunyuan Video FastHunyuan 720p 13B",
"architecture": "hunyuan",
"description": "Fast Hunyuan is an accelerated HunyuanVideo model. It can sample high quality videos with 6 diffusion steps.",
"URLs": [
@@ -1,7 +1,7 @@
{
"model":
{
"name": "Wan2.1 image2video 480p 14B",
"name": "Wan2.1 Image2video 480p 14B",
"architecture" : "i2v",
"description": "The standard Wan Image 2 Video specialized to generate 480p images. It also offers Start and End Image support (End Image is not supported in the original model but seems to work well)",
"URLs": [
25
defaults/i2v_2_2.json
Normal file
@@ -0,0 +1,25 @@
{
"model":
{
"name": "Wan2.2 Image2video 14B",
"architecture" : "i2v_2_2",
"description": "Wan 2.2 Image 2 Video model. Contrary to Wan 2.1 Image2video, this model is structurally close to the t2v model. You will consequently need to store Loras for this model in the t2v Lora folder.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_quanto_mfp16_int8.safetensors"
],
"URLs2": [
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_quanto_mfp16_int8.safetensors"
],
"group": "wan2_2"
},
"guidance_phases": 2,
"switch_threshold" : 900,
"guidance_scale" : 3.5,
"guidance2_scale" : 3.5,
"flow_shift" : 5

}
18
defaults/i2v_2_2_multitalk.json
Normal file
@@ -0,0 +1,18 @@
{
"model":
{
"name": "Wan2.2 Multitalk 14B",
"architecture" : "i2v_2_2_multitalk",
"description": "The Multitalk module of Wan 2.1 has been combined with Wan 2.2 image 2 video. It lets up to two people have a conversation.",
"modules": ["multitalk"],
"URLs": "i2v_2_2",
"URLs2": "i2v_2_2",
"group": "wan2_2",
"visible": false
},
"switch_threshold" : 900,
"guidance_scale" : 3.5,
"guidance2_scale" : 3.5,
"flow_shift" : 5

}
@@ -1,7 +1,7 @@
{
"model":
{
"name": "Wan2.1 image2video 720p 14B",
"name": "Wan2.1 Image2video 720p 14B",
"architecture" : "i2v",
"description": "The standard Wan Image 2 Video specialized to generate 720p images. It also offers Start and End Image support (End Image is not supported in the original model but seems to work well).",
"URLs": [
@@ -1,7 +1,7 @@
{
"model":
{
"name": "Wan2.1 image2video 480p FusioniX 14B",
"name": "Wan2.1 Image2video 480p FusioniX 14B",
"architecture" : "i2v",
"description": "A powerful merged image-to-video model based on the original WAN 2.1 I2V model, enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail.",
"URLs": "i2v",
16
defaults/infinitetalk.json
Normal file
@@ -0,0 +1,16 @@
{
"model": {
"name": "Infinitetalk Single Speaker 480p",
"architecture": "infinitetalk",
"modules": [
[
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_single_14B_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_single_14B_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_single_14B_quanto_mfp16_int8.safetensors"
]
],
"description": "The Infinitetalk model is an improved version of Multitalk that supports very long videos. This is the single speaker version. The Sliding Window size must be 81 frames to get smooth transitions between shots.",
"one_speaker_only": true,
"URLs": "i2v"
}
}
16
defaults/infinitetalk_multi.json
Normal file
@@ -0,0 +1,16 @@
{
"model": {
"name": "Infinitetalk Multi Speakers 480p",
"architecture": "infinitetalk",
"modules": [
[
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_multi_14B_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_multi_14B_quanto_mfp16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_multi_14B_quanto_mbf16_int8.safetensors"
]
],
"description": "The Infinitetalk model is an improved version of Multitalk that supports very long videos. This is the multi speakers version. The Sliding Window size must be 81 frames to get smooth transitions between shots.",
"multi_speakers_only": true,
"URLs": "i2v"
}
}
@@ -1,14 +1,19 @@
{
"model":
{
"name": "LTX Video 0.9.7 13B",
"name": "LTX Video 0.9.8 13B",
"architecture" : "ltxv_13B",
"description": "LTX Video is a fast model that can be used to generate long videos (up to 260 frames). It is recommended to keep the number of steps to 30 or you will need to update the file 'ltxv_video/configs/ltxv-13b-0.9.7-dev.yaml'. The LTX Video model expects very long prompts, so don't hesitate to use the Prompt Enhancer.",
"description": "LTX Video is a fast model that can be used to generate very, very long videos (up to 1800 frames!). It is recommended to keep the number of steps to 30 or you will need to update the file 'ltxv_video/configs/ltxv-13b-0.9.8-dev.yaml'. The LTX Video model expects very long prompts, so don't hesitate to use the Prompt Enhancer.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.7_13B_dev_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors"
"https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.8_13B_dev_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.8_13B_dev_quanto_bf16_int8.safetensors"
],
"LTXV_config": "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml"
"preload_URLs" : [
"https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv-097-ic-lora-pose-control-diffusers.safetensors",
"https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv-097-ic-lora-depth-control-diffusers.safetensors",
"https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv-097-ic-lora-canny-control-diffusers.safetensors"
],
"LTXV_config": "models/ltx_video/configs/ltxv-13b-0.9.8-dev.yaml"
},
"num_inference_steps": 30
}
@@ -1,14 +1,15 @@
{
"model":
{
"name": "LTX Video 0.9.7 Distilled 13B",
"name": "LTX Video 0.9.8 Distilled 13B",
"architecture" : "ltxv_13B",
"description": "LTX Video is a fast model that can be used to generate long videos (up to 260 frames). This distilled version is a very fast version and retains a high level of quality. The LTX Video model expects very long prompts, so don't hesitate to use the Prompt Enhancer.",
"URLs": "ltxv_13B",
"loras": ["https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.7_13B_distilled_lora128_bf16.safetensors"],
"loras_multipliers": [ 1 ],
"lock_inference_steps": true,
"LTXV_config": "ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml"
"description": "LTX Video is a fast model that can be used to generate very long videos (up to 1800 frames!). This distilled version is a very fast version and retains a high level of quality. The LTX Video model expects very long prompts, so don't hesitate to use the Prompt Enhancer.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.8_13B_distilled_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.8_13B_distilled_quanto_bf16_int8.safetensors"
],
"preload_URLs" : "ltxv_13B",
"LTXV_config": "models/ltx_video/configs/ltxv-13b-0.9.8-distilled.yaml"
},
"num_inference_steps": 6
}
@@ -3,7 +3,11 @@
{
"name": "Multitalk 480p",
"architecture" : "multitalk",
"modules": ["multitalk"],
"modules": [
["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_multitalk_14B_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_multitalk_14B_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_multitalk_14B_quanto_mfp16_int8.safetensors"]
],
"description": "The Multitalk model corresponds to the original Wan image 2 video model combined with the Multitalk module. It lets up to two people have a conversation.",
"URLs": "i2v",
"teacache_coefficients" : [-3.02331670e+02, 2.23948934e+02, -5.25463970e+01, 5.87348440e+00, -2.01973289e-01]
21
defaults/qwen_image_20B.json
Normal file
@@ -0,0 +1,21 @@
{
"model": {
"name": "Qwen Image 20B",
"architecture": "qwen_image_20B",
"description": "Qwen Image is a generative model that will generate very high quality images. It is one of the few models capable of generating very long texts inside the image.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_20B_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_20B_quanto_bf16_int8.safetensors"
],
"xresolutions": [ ["1328x1328 (1:1)", "1328x1328"],
["1664x928 (16:9)", "1664x928"],
["928x1664 (9:16)", "928x1664"],
["1472x1140 (4:3)", "1472x1140"],
["1140x1472 (3:4)", "1140x1472"]],
"attention": {"<89" : "sdpa"},
"image_outputs": true
},
"prompt": "draw a hat",
"resolution": "1280x720",
"batch_size": 1
}
19
defaults/qwen_image_edit_20B.json
Normal file
@@ -0,0 +1,19 @@
{
"model": {
"name": "Qwen Image Edit 20B",
"architecture": "qwen_image_edit_20B",
"description": "Qwen Image Edit is a generative model that can generate very high quality images with long texts in them. Best results will be at 720p. Use it to edit a Subject or combine multiple Subjects.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_20B_bf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_20B_quanto_bf16_int8.safetensors"
],
"attention": {
"<89": "sdpa"
},
"reference_image": true,
"image_outputs": true
},
"prompt": "add a hat",
"resolution": "1280x720",
"batch_size": 1
}
10
defaults/standin.json
Normal file
@@ -0,0 +1,10 @@
{
"model":
{
"name": "Wan2.1 Standin 14B",
"modules": [ ["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Stand-In_wan2.1_T2V_14B_ver1.0_bf16.safetensors"]],
"architecture" : "standin",
"description": "The original Wan Text 2 Video model combined with the StandIn module to improve Identity Preservation. You need to provide a Reference Image with a white background which is a close up of a person's face, in order to transfer this person into the Video.",
"URLs": "t2v"
}
}
@@ -1,13 +0,0 @@
{
"model": {
"name": "Wan2.1 text2image 14B",
"architecture": "t2v",
"description": "The original Wan Text 2 Video model configured to generate an image instead of a video.",
"image_outputs": true,
"URLs": "t2v"
},
"video_length": 1,
"resolution": "1280x720"
}
@@ -1,7 +1,7 @@
{
"model":
{
"name": "Wan2.1 text2video 14B",
"name": "Wan2.1 Text2video 14B",
"architecture" : "t2v",
"description": "The original Wan Text 2 Video model. Most other models have been built on top of it",
"URLs": [
@@ -1,11 +1,11 @@
{
"model":
{
"name": "Wan2.1 text2video 1.3B",
"name": "Wan2.1 Text2video 1.3B",
"architecture" : "t2v_1.3B",
"description": "The light version of the original Wan Text 2 Video model. Most other models have been built on top of it",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_text2video_1.3B_bf16.safetensors"
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_text2video_1.3B_mbf16.safetensors"
]
}
}
25
defaults/t2v_2_2.json
Normal file
@@ -0,0 +1,25 @@
{
"model":
{
"name": "Wan2.2 Text2video 14B",
"architecture" : "t2v",
"description": "Wan 2.2 Text 2 Video model",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_quanto_mfp16_int8.safetensors"
],
"URLs2": [
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_quanto_mfp16_int8.safetensors"
],
"group": "wan2_2"
},
"guidance_phases": 2,
"switch_threshold" : 875,
"guidance_scale" : 4,
"guidance2_scale" : 3,
"flow_shift" : 12

}
@@ -1,7 +1,7 @@
{
"model":
{
"name": "Wan2.1 text2video FusioniX 14B",
"name": "Wan2.1 Text2video FusioniX 14B",
"architecture" : "t2v",
"description": "A powerful merged text-to-video model based on the original WAN 2.1 T2V model, enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail.",
"URLs": [
@@ -1,6 +1,6 @@
{
"model": {
"name": "Wan2.1 text2video Self-Forcing 14B",
"name": "Wan2.1 Text2video Self-Forcing 14B",
"architecture": "t2v",
"description": "This model is an advanced text-to-video generation model. This approach allows the model to generate videos with significantly fewer inference steps (4 or 8 steps) and without classifier-free guidance, substantially reducing video generation time while maintaining high quality outputs.",
"URLs": [
17
defaults/ti2v_2_2.json
Normal file
@@ -0,0 +1,17 @@
{
"model": {
"name": "Wan2.2 TextImage2video 5B",
"architecture": "ti2v_2_2",
"description": "Wan 2.2 Text 2 Video model 5B",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_5B_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_5B_quanto_mbf16_int8.safetensors"
],
"group": "wan2_2"
},
"video_length": 121,
"guidance_scale": 5,
"flow_shift": 5,
"num_inference_steps": 50,
"resolution": "1280x720"
}
15
defaults/ti2v_2_2_fastwan.json
Normal file
@@ -0,0 +1,15 @@
{
"model": {
"name": "Wan2.2 FastWan TextImage2video 5B",
"architecture": "ti2v_2_2",
"description": "FastWan2.2-TI2V-5B-Full-Diffusers is built upon Wan-AI/Wan2.2-TI2V-5B-Diffusers. It supports efficient 3-step inference and produces high-quality videos at 121x704x1280 resolution",
"URLs": "ti2v_2_2",
"loras": ["https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan2_2_5B_FastWanFullAttn_lora_rank_128_bf16.safetensors"],
"group": "wan2_2"
},
"video_length": 121,
"guidance_scale": 1,
"flow_shift": 3,
"num_inference_steps": 3,
"resolution": "1280x720"
}
@@ -3,9 +3,10 @@
{
"name": "Vace ControlNet 1.3B",
"architecture" : "vace_1.3B",
"description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based on additional custom data: pose or depth video, images or objects you want to see in the video.",
"URLs": [
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_1.3B_mbf16.safetensors"
]
"modules": [
["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_1_3B_module.safetensors"]
],
"description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based on additional custom data: pose or depth video, images or objects you want to see in the video.",
"URLs": "t2v_1.3B"
}
}
@@ -3,7 +3,9 @@
"name": "Vace ControlNet 14B",
"architecture": "vace_14B",
"modules": [
"vace_14B"
["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_14B_module_mbf16.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_14B_module_quanto_mbf16_int8.safetensors",
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_14B_module_quanto_mfp16_int8.safetensors"]
],
"description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based on additional custom data: pose or depth video, images or objects you want to see in the video.",
|
||||
"URLs": "t2v"
|
||||
|
||||
21
defaults/vace_14B_cocktail.json
Normal file
@ -0,0 +1,21 @@
|
||||
{
|
||||
"model": {
|
||||
"name": "Vace Cocktail 14B",
|
||||
"architecture": "vace_14B",
|
||||
"modules": [
|
||||
"vace_14B"
|
||||
],
|
||||
"description": "This model has been created on the fly using the Wan text 2 video model and the Loras of FusioniX. The weight of the Detail Enhancer Lora has been reduced to improve identity preservation. Copy the model def in the finetune folder to change the Cocktail composition.",
|
||||
"URLs": "t2v",
|
||||
"loras": [
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_CausVid_14B_T2V_lora_rank32_v2.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/DetailEnhancerV1.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_AccVid_T2V_14B_lora_rank32_fp16.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_T2V_14B_MoviiGen_lora_rank32_fp16.safetensors"
|
||||
],
|
||||
"loras_multipliers": [1, 0.5, 0.5, 0.5]
|
||||
},
|
||||
"num_inference_steps": 10,
|
||||
"guidance_scale": 1,
|
||||
"flow_shift": 2
|
||||
}
|
||||
26
defaults/vace_14B_cocktail_2_2.json
Normal file
@ -0,0 +1,26 @@
|
||||
{
|
||||
"model": {
|
||||
"name": "Wan2.2 Vace Experimental Cocktail 14B",
|
||||
"architecture": "vace_14B",
|
||||
"modules": [
|
||||
"vace_14B"
|
||||
],
|
||||
"description": "This model has been created on the fly using the Wan text 2.2 video model and the Loras of FusioniX. The weight of the Detail Enhancer Lora has been reduced to improve identity preservation. There is so far only PARTIAL support of Vace 2.1 which is currently used.",
|
||||
"URLs": "t2v_2_2",
|
||||
"URLs2": "t2v_2_2",
|
||||
"loras": [
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_CausVid_14B_T2V_lora_rank32_v2.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/DetailEnhancerV1.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_AccVid_T2V_14B_lora_rank32_fp16.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_T2V_14B_MoviiGen_lora_rank32_fp16.safetensors"
|
||||
],
|
||||
"loras_multipliers": [1, 0.2, 0.5, 0.5],
|
||||
"group": "wan2_2"
|
||||
},
|
||||
"guidance_phases": 2,
|
||||
"num_inference_steps": 10,
|
||||
"guidance_scale": 1,
|
||||
"guidance2_scale": 1,
|
||||
"flow_shift": 2,
|
||||
"switch_threshold" : 875
|
||||
}
|
||||
@ -1,16 +0,0 @@
|
||||
{
|
||||
"model": {
|
||||
"name": "Vace FusioniX image2image 14B",
|
||||
"architecture": "vace_14B",
|
||||
"modules": [
|
||||
"vace_14B"
|
||||
],
|
||||
"image_outputs": true,
|
||||
"description": "Vace control model enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail.",
|
||||
"URLs": "t2v_fusionix"
|
||||
},
|
||||
"resolution": "1280x720",
|
||||
"guidance_scale": 1,
|
||||
"num_inference_steps": 10,
|
||||
"video_length": 1
|
||||
}
|
||||
29
defaults/vace_14B_lightning_3p_2_2.json
Normal file
@ -0,0 +1,29 @@
|
||||
{
|
||||
"model": {
|
||||
"name": "Wan2.2 Vace Lightning 3 Phases 14B",
|
||||
"architecture": "vace_14B",
|
||||
"modules": [
|
||||
"vace_14B"
|
||||
],
|
||||
"description": "This finetune uses the Lightning 4 steps Loras Accelerator for Wan 2.2 but extend them to 8 steps in order to insert a CFG phase before the 2 accelerated phases with no Guidance. The ultimate goal is reduce the slow motion effect of these Loras Accelerators.",
|
||||
"URLs": "t2v_2_2",
|
||||
"URLs2": "t2v_2_2",
|
||||
"loras": [
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan2.2-Lightning_T2V-v1.1-A14B-4steps-lora_HIGH_fp16.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan2.2-Lightning_T2V-v1.1-A14B-4steps-lora_LOW_fp16.safetensors"
|
||||
],
|
||||
"loras_multipliers": ["0;1;0", "0;0;1"],
|
||||
"lock_guidance_phases": true,
|
||||
"group": "wan2_2"
|
||||
},
|
||||
"num_inference_steps": 8,
|
||||
"guidance_phases": 3,
|
||||
"guidance_scale": 3.5,
|
||||
"guidance2_scale": 1,
|
||||
"guidance3_scale": 1,
|
||||
"switch_threshold": 965,
|
||||
"switch_threshold2": 800,
|
||||
"model_switch_phase": 2,
|
||||
"flow_shift": 3,
|
||||
"sample_solver": "euler"
|
||||
}
|
||||
9
defaults/vace_standin_14B.json
Normal file
@ -0,0 +1,9 @@
|
||||
{
|
||||
"model": {
|
||||
"name": "Vace Standin 14B",
|
||||
"architecture": "vace_standin_14B",
|
||||
"modules": [ "vace_14B", "standin"],
|
||||
"description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based of additional custom data : pose or depth video, images or objects you want to see in the video.",
|
||||
"URLs": "t2v"
|
||||
}
|
||||
}
|
||||
@ -1,16 +1,106 @@
|
||||
# Changelog
|
||||
|
||||
## 🔥 Latest News
|
||||
### July 21 2025: WanGP v7.1
|
||||
- Flux Family Reunion : *Flux Dev* and *Flux Schnell* have been invited aboard WanGP. To celebrate that, Loras support for the Flux *diffusers* format has also been added.
|
||||
|
||||
- LTX Video upgraded to version 0.9.8: you can now generate 1800 frames (1 min of video!) in one go without a sliding window. With the distilled model it will take only 5 minutes with an RTX 4090 (you will need 22 GB of VRAM though). I have added options to select higher numbers of frames if you want to experiment.
|
||||
|
||||
- LTX Video ControlNet : it is a Control Net that allows you, for instance, to transfer Human motion or Depth from a control video. It is not as powerful as Vace but can produce interesting things, especially as you can now quickly generate a 1 min video. Under the hood, IC-Loras (see below) for Pose, Depth and Canny are automatically loaded for you, no need to add them.
|
||||
|
||||
- LTX IC-Lora support: these are special Loras that consume a conditional image or video.
|
||||
Besides the pose, depth and canny IC-Loras that are transparently loaded, there is the *detailer* (https://huggingface.co/Lightricks/LTX-Video-ICLoRA-detailer-13b-0.9.8) which is basically an upsampler. Add the *detailer* as a Lora and select LTX Raw Format as the control net choice to use it.
|
||||
|
||||
And Also:
|
||||
- easier way to select video resolution
|
||||
- started to optimize Matanyone to reduce VRAM requirements
|
||||
|
||||
|
||||
### July 15 2025: WanGP v7.0 is an AI Powered Photoshop
|
||||
This release turns the Wan models into Image Generators. This goes way beyond simply generating a video made of a single frame:
|
||||
- Multiple Images generated at the same time so that you can choose the one you like best. It is highly VRAM optimized so that you can generate, for instance, 4 720p Images at the same time with less than 10 GB of VRAM
|
||||
- With *image2image* the original text2video model of WanGP becomes an image upsampler / restorer
|
||||
- *Vace image2image* comes out of the box with image outpainting, person / object replacement, ...
|
||||
- You can use in one click a newly generated Image as a Start Image or Reference Image for a Video generation
|
||||
|
||||
And to complete the full suite of AI Image Generators, Ladies and Gentlemen please welcome for the first time in WanGP : **Flux Kontext**.\
|
||||
As a reminder Flux Kontext is an image editor: give it an image and a prompt and it will make the change for you.\
|
||||
This highly optimized version of Flux Kontext will make you feel that you have been cheated all this time as WanGP Flux Kontext requires only 8 GB of VRAM to generate 4 images at the same time with no need for quantization.
|
||||
|
||||
WanGP v7 comes with vanilla *Image2image* and *Vace FusioniX*. However you can build your own finetune where you combine a text2video or Vace model with any combination of Loras.
|
||||
|
||||
Also in the news:
|
||||
- You can now enter the *Bbox* for each speaker in *Multitalk* to precisely locate who is speaking. And to save some headaches the *Image Mask generator* will give you the *Bbox* coordinates of an area you have selected.
|
||||
- *Film Grain* post processing to add a vintage look to your video
|
||||
- *First Last Frame to Video* model should work much better now as I have recently discovered its implementation was not complete
|
||||
- More power for the finetuners, you can now embed Loras directly in the finetune definition. You can also override the default models (titles, visibility, ...) with your own finetunes. Check the doc that has been updated.
|
||||
|
||||
|
||||
### July 10 2025: WanGP v6.7, is NAG a game changer ? you tell me
|
||||
Maybe you knew that already but most *Loras accelerators* we use today (Causvid, FusioniX) don't use *Guidance* at all (that is, *CFG* is set to 1). This helps to get much faster generations but the downside is that *Negative Prompts* are completely ignored (including the default ones set by the models). **NAG** (https://github.com/ChenDarYen/Normalized-Attention-Guidance) aims to solve that by injecting the *Negative Prompt* during the *attention* processing phase.
|
||||
|
||||
So WanGP 6.7 gives you NAG, but not just any NAG: a *Low VRAM* implementation, since the default one ends up being VRAM greedy. You will find NAG in the *General* advanced tab for most Wan models.
|
||||
|
||||
Use NAG especially when Guidance is set to 1. To turn it on, set the **NAG scale** to something around 10. There are other NAG parameters, **NAG tau** and **NAG alpha**, which I recommend changing only if you don't get good results by just playing with the NAG scale. Don't hesitate to share on this discord server the best combinations for these 3 parameters.
|
||||
|
||||
The authors of NAG claim that NAG can also be used when using a Guidance (CFG > 1) and to improve the prompt adherence.
|
||||
|
||||
### July 8 2025: WanGP v6.6, WanGP offers you **Vace Multitalk Dual Voices Fusionix Infinite** :
|
||||
**Vace** our beloved super Control Net has been combined with **Multitalk** the new king in town that can animate up to two people speaking (**Dual Voices**). It is accelerated by the **Fusionix** model and thanks to *Sliding Windows* support and *Adaptive Projected Guidance* (much slower but should reduce the reddish effect with long videos) your two people will be able to talk for a very long time (which is an **Infinite** amount of time in the field of video generation).
|
||||
|
||||
Of course you will get as well *Multitalk* vanilla and also *Multitalk 720p* as a bonus.
|
||||
|
||||
And since I am mister nice guy I have enclosed as an exclusivity an *Audio Separator* that will save you time to isolate each voice when using Multitalk with two people.
|
||||
|
||||
As I feel like resting a bit I haven't yet produced a nice sample Video to illustrate all these new capabilities. But here is the thing: I am sure you will publish your *Master Pieces* in the *Share Your Best Video* channel. The best ones will be added to the *Announcements Channel* and will bring eternal fame to their authors.
|
||||
|
||||
But wait, there is more:
|
||||
- Sliding Windows support has been added everywhere for Wan models, so imagine: with text2video recently upgraded in 6.5 into a video2video, you can now upsample very long videos regardless of your VRAM. The good old image2video model can now reuse the last image to produce new videos (as requested by many of you)
|
||||
- I have also added the capability to transfer the audio of the original control video (Misc. advanced tab) and an option to preserve the fps in the generated video, so from now on you will be able to upsample / restore your old family videos and keep the audio at its original pace. Be aware that the duration will be limited to 1000 frames as I still need to add streaming support for unlimited video sizes.
|
||||
|
||||
Also, of interest too:
|
||||
- Extract video info from Videos that have not been generated by WanGP, even better you can also apply post processing (Upsampling / MMAudio) on non WanGP videos
|
||||
- Force the generated video fps to your liking, works very well with Vace when using a Control Video
|
||||
- Ability to chain URLs of Finetune models (for instance put the URLs of a model in your main finetune and reference this finetune in other finetune models to save time)
|
||||
|
||||
### July 2 2025: WanGP v6.5.1, WanGP takes care of you: lots of quality of life features:
|
||||
- View directly inside WanGP the properties (seed, resolutions, length, most settings...) of the past generations
|
||||
- In one click use the newly generated video as a Control Video or Source Video to be continued
|
||||
- Manage multiple settings for the same model and switch between them using a dropdown box
|
||||
- WanGP will keep the last generated videos in the Gallery and will remember the last model you used if you restart the app but kept the Web page open
|
||||
- Custom resolutions : add a file in the WanGP folder with the list of resolutions you want to see in WanGP (look at the instruction readme in this folder)
|
||||
|
||||
Taking care of your life is not enough, you want new stuff to play with ?
|
||||
- MMAudio directly inside WanGP : add an audio soundtrack that matches the content of your video. By the way it is a low VRAM MMAudio and 6 GB of VRAM should be sufficient. You will need to go in the *Extensions* tab of the WanGP *Configuration* to enable MMAudio
|
||||
- Forgot to upsample your video during the generation ? want to try another MMAudio variation ? Fear not, you can also apply upsampling or add an MMAudio track once the video generation is done. Even better, you can ask WanGP for multiple variations of MMAudio to pick the one you like best
|
||||
- MagCache support: a new step skipping approach, supposed to be better than TeaCache. Makes a difference if you usually generate with a high number of steps
|
||||
- SageAttention2++ support : not just the compatibility but also a slightly reduced VRAM usage
|
||||
- Video2Video in Wan Text2Video : this is the paradox, a text2video can become a video2video if you start the denoising process later on an existing video
|
||||
- FusioniX upsampler: this is an illustration of Video2Video in Text2Video. Use the FusioniX text2video model with an output resolution of 1080p and a denoising strength of 0.25 and you will get one of the best upsamplers (in only 2/3 steps, you will need lots of VRAM though). Increase the denoising strength and you will get one of the best Video Restorers
|
||||
- Choice of Wan Samplers / Schedulers
|
||||
- More Lora formats support
|
||||
|
||||
**If you had upgraded to v6.5 please upgrade again to 6.5.1 as this will fix a bug that ignored Loras beyond the first one**
|
||||
|
||||
### June 23 2025: WanGP v6.3, Vace Unleashed. Thought we couldn't squeeze Vace even more ?
|
||||
- Multithreaded preprocessing when possible for faster generations
|
||||
- Multithreaded frames Lanczos Upsampling as a bonus
|
||||
- A new Vace preprocessor : *Flow* to extract fluid motion
|
||||
- Multi Vace Controlnets: you can now transfer several properties at the same time. This opens new possibilities to explore, for instance if you transfer *Human Movement* and *Shapes* at the same time, for some reason the lighting of your character will take its environment into account much more.
|
||||
- Injected Frames Outpainting, in case you missed it in WanGP 6.21
|
||||
|
||||
Don't know how to use all of the Vace features ? Check the Vace Guide embedded in WanGP as it has also been updated.
|
||||
|
||||
|
||||
### June 19 2025: WanGP v6.2, Vace even more Powercharged
|
||||
Have I told you that I am a big fan of Vace ? Here are more goodies to unleash its power:
|
||||
👋 Have I told you that I am a big fan of Vace ? Here are more goodies to unleash its power:
|
||||
- If you ever wanted to watch Star Wars in 4:3, just use the new *Outpainting* feature and it will add the missing bits of image at the top and the bottom of the screen. The best thing is *Outpainting* can be combined with all the other Vace modifications, for instance you can change the main character of your favorite movie at the same time
|
||||
- More processing can be combined at the same time (for instance the depth process can be applied outside the mask)
|
||||
- Upgraded the depth extractor to Depth Anything 2 which is much more detailed
|
||||
|
||||
As a bonus, I have added two finetunes based on the Self-Forcing technology (which requires only 4 steps to generate a video): Wan 2.1 text2video Self-Forcing and Vace Self-Forcing. I know there is a Lora around but the quality of the Lora is worse (at least with Vace) compared to the full model. Don't hesitate to share your opinion about this on the discord server.
|
||||
|
||||
### June 17 2025: WanGP v6.1, Vace Powercharged
|
||||
Lots of improvements for Vace the Mother of all Models:
|
||||
👋 Lots of improvements for Vace the Mother of all Models:
|
||||
- masks can now be combined with on the fly processing of a control video, for instance you can extract the motion of a specific person defined by a mask
|
||||
- on the fly modification of masks : reversed masks (with the same mask you can modify the background instead of the people covered by the masks), enlarged masks (you can cover more area if for instance the person you are trying to inject is larger than the one in the mask), ...
|
||||
- view these modified masks directly inside WanGP during the video generation to check they are really as expected
|
||||
@ -37,22 +127,6 @@ You get **Vace FusioniX**: the Ultimate Vace Model, Fast (10 steps, no need for
|
||||
|
||||
Check the *Finetune Guide* to create finetune models definitions and share them on the WanGP discord server.
|
||||
|
||||
### June 12 2025: WanGP v5.6
|
||||
👋 *Finetune models*: You find the 20 models supported by WanGP not sufficient ? Too impatient to wait for the next release to get support for a newly released model ? Your prayers have been answered: if a new model is compatible with a model architecture supported by WanGP, you can add support for this model yourself by just creating a finetune model definition. You can then store this model in the cloud (for instance on Huggingface) and the very light finetune definition file can be easily shared with other users. WanGP will automatically download the finetuned model for them.
|
||||
|
||||
To celebrate the new finetunes support, here are a few finetune gifts (directly accessible from the model selection menu):
|
||||
- *Fast Hunyuan Video* : generate t2v videos in only 6 steps
|
||||
- *Hunyuan Video AccVideo* : generate t2v videos in only 5 steps
|
||||
- *Wan FusioniX*: it is a combo of AccVideo / CausVid and other models and can generate high quality Wan videos in only 8 steps
|
||||
|
||||
One more thing...
|
||||
|
||||
The new finetune system can be used to combine complementary models : what happens when you combine Fusionix Text2Video and Vace Control Net ?
|
||||
|
||||
You get **Vace FusioniX**: the Ultimate Vace Model, Fast (10 steps, no need for guidance) and with a much better quality Video than the original slower model (despite being the best Control Net out there). Here goes one more finetune...
|
||||
|
||||
Check the *Finetune Guide* to create finetune models definitions and share them on the WanGP discord server.
|
||||
|
||||
### June 11 2025: WanGP v5.5
|
||||
👋 *Hunyuan Video Custom Audio*: it is similar to Hunyuan Video Avatar except there isn't any lower limit on the number of frames and you can use your reference images in a different context than the image itself\
|
||||
*Hunyuan Video Custom Edit*: Hunyuan Video Controlnet, use it to do inpainting and replace a person in a video while still keeping their pose. Similar to Vace but less restricted than the Wan models in terms of content...
|
||||
|
||||
@ -55,16 +55,39 @@ For instance if one adds a module *vace_14B* on top of a model with architecture
|
||||
- *architecture* : architecture Id of the base model of the finetune (see previous section)
|
||||
- *description*: description of the finetune that will appear at the top
|
||||
- *URLs*: URLs of all the finetune versions (quantized / non quantized). WanGP will pick the version that is the closest to the user preferences. You will need to follow a naming convention to help WanGP identify the content of each version (see next section). Right now WanGP supports only 8-bit quantized models that have been quantized using **quanto**. WanGP offers a command switch to easily build such a quantized model (see below). *URLs* can also contain paths to local files to allow testing.
|
||||
- *URLs2*: URLs of all the finetune versions (quantized / non quantized) of the weights used for the second phase of a model. For instance with Wan 2.2, the first phase contains the High Noise model weights and the second phase contains the Low Noise model weights. This feature can be used with other models than Wan 2.2 to combine different model weights during the same video generation.
|
||||
- *modules*: this is a list of modules to be combined with the models referenced by the URLs. A module is a model extension that is merged with a model to expand its capabilities. Supported modules so far are : *vace_14B* and *multitalk*. For instance the full Vace model is the fusion of a Wan text 2 video model and the Vace module.
|
||||
- *preload_URLs* : URLs of files to download no matter what (used to load quantization maps for instance)
|
||||
-*loras* : URLs of Loras that will applied before any other Lora specified by the user. These loras will be quite often Loras accelerator. For instance if you specified here the FusioniX Lora you will be able to reduce the number of generation steps to -*loras_multipliers* : a list of float numbers that defines the weight of each Lora mentioned above.
|
||||
- *loras* : URLs of Loras that will be applied before any other Lora specified by the user. These loras will quite often be Loras accelerators. For instance if you specify here the FusioniX Lora you will be able to reduce the number of generation steps to 10
|
||||
- *loras_multipliers* : a list of float numbers or strings that defines the weight of each Lora mentioned in *loras*. The string syntax is used if you want your lora multiplier to change over the steps (please check the Loras doc) or if you want a multiplier to be applied only during a specific High Noise phase or Low Noise phase of a Wan 2.2 model. For instance, in the snippet below the multiplier is only applied during the High Noise phase: for half of the steps of this phase the multiplier will be 1 and for the other half 1.1 (a complete definition sketch is shown after this list).
|
||||
```
|
||||
"loras" : [ "my_lora.safetensors"],
|
||||
"loras_multipliers" : [ "1,1.1;0"]
|
||||
```
|
||||
|
||||
- *auto_quantize*: if set to True and no quantized model URL is provided, WanGP will perform on the fly quantization if the user expects a quantized model
|
||||
-*visible* : by default assumed to be true. If set to false the model will no longer be visible. This can be useful if you create a finetune to override a default model and hide it.
|
||||
-*image_outputs* : turn any model that generates a video into a model that generates images. In fact it will adapt the user interface for image generation and ask the model to generate a video with a single frame.
|
||||
|
||||
In order to favor reusability the properties of *URLs*, *modules*, *loras* and *preload_URLs* can contain instead of a list of URLs a single text which corresponds to the id of a finetune or default model to reuse.
|
||||
In order to favor reusability the properties of *URLs*, *modules*, *loras* and *preload_URLs* can contain instead of a list of URLs a single text which corresponds to the id of a finetune or default model to reuse. Instead of:
|
||||
```
|
||||
"URLs": [
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_mbf16.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_quanto_mbf16_int8.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_quanto_mfp16_int8.safetensors"
|
||||
],
|
||||
"URLs2": [
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_mbf16.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_quanto_mbf16_int8.safetensors",
|
||||
"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_quanto_mfp16_int8.safetensors"
|
||||
],
|
||||
```
|
||||
You can write:
|
||||
```
|
||||
"URLs": "t2v_2_2",
|
||||
"URLs2": "t2v_2_2",
|
||||
```
|
||||
|
||||
For example, let’s say you have defined a *t2v_fusionix.json* file which contains the URLs to download the finetune. In *vace_fusionix.json* you can write "URLs" : "fusionix" to automatically reuse the URLs already defined in the corresponding file.
|
||||
|
||||
Example of **model** subtree
|
||||
```
|
||||
|
||||
@ -8,9 +8,9 @@ This guide covers installation for different GPU generations and operating syste
|
||||
- Conda or Python venv
|
||||
- Compatible GPU (RTX 10XX or newer recommended)
|
||||
|
||||
## Installation for RTX 10XX to RTX 40XX (Stable)
|
||||
## Installation for RTX 10XX to RTX 50XX (Stable)
|
||||
|
||||
This installation uses PyTorch 2.6.0 which is well-tested and stable.
|
||||
This installation uses PyTorch 2.7.0 which is well-tested and stable.
|
||||
|
||||
### Step 1: Download and Setup Environment
|
||||
|
||||
@ -27,8 +27,8 @@ conda activate wan2gp
|
||||
### Step 2: Install PyTorch
|
||||
|
||||
```shell
|
||||
# Install PyTorch 2.6.0 with CUDA 12.4
|
||||
pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
|
||||
# Install PyTorch 2.7.0 with CUDA 12.4
|
||||
pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
|
||||
```
|
||||
|
||||
### Step 3: Install Dependencies
|
||||
@ -40,7 +40,7 @@ pip install -r requirements.txt
|
||||
|
||||
### Step 4: Optional Performance Optimizations
|
||||
|
||||
#### Sage Attention (30% faster)
|
||||
#### Sage Attention (30% faster), don't install with RTX 50xx as it is not compatible
|
||||
|
||||
```shell
|
||||
# Windows only: Install Triton
|
||||
@ -58,6 +58,7 @@ pip install triton-windows
|
||||
pip install https://github.com/woct0rdho/SageAttention/releases/download/v2.1.1-windows/sageattention-2.1.1+cu126torch2.6.0-cp310-cp310-win_amd64.whl
|
||||
|
||||
# Linux (manual compilation required)
|
||||
python -m pip install "setuptools<=75.8.2" --force-reinstall
|
||||
git clone https://github.com/thu-ml/SageAttention
|
||||
cd SageAttention
|
||||
pip install -e .
|
||||
@ -70,61 +71,7 @@ pip install -e .
|
||||
pip install flash-attn==2.7.2.post1
|
||||
```
|
||||
|
||||
## Installation for RTX 50XX (Beta)
|
||||
|
||||
RTX 50XX GPUs require PyTorch 2.7.0 (beta). This version may be less stable.
|
||||
|
||||
⚠️ **Important:** Use Python 3.10 for compatibility with pip wheels.
|
||||
|
||||
### Step 1: Setup Environment
|
||||
|
||||
```shell
|
||||
# Clone and setup (same as above)
|
||||
git clone https://github.com/deepbeepmeep/Wan2GP.git
|
||||
cd Wan2GP
|
||||
conda create -n wan2gp python=3.10.9
|
||||
conda activate wan2gp
|
||||
```
|
||||
|
||||
### Step 2: Install PyTorch Beta
|
||||
|
||||
```shell
|
||||
# Install PyTorch 2.7.0 with CUDA 12.8
|
||||
pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
|
||||
```
|
||||
|
||||
### Step 3: Install Dependencies
|
||||
|
||||
```shell
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### Step 4: Optional Optimizations for RTX 50XX
|
||||
|
||||
#### Sage Attention
|
||||
|
||||
```shell
|
||||
# Windows
|
||||
pip install triton-windows
|
||||
pip install sageattention==1.0.6
|
||||
|
||||
# Linux
|
||||
pip install sageattention==1.0.6
|
||||
```
|
||||
|
||||
#### Sage 2 Attention
|
||||
|
||||
```shell
|
||||
# Windows
|
||||
pip install triton-windows
|
||||
pip install https://github.com/woct0rdho/SageAttention/releases/download/v2.1.1-windows/sageattention-2.1.1+cu128torch2.7.0-cp310-cp310-win_amd64.whl
|
||||
|
||||
# Linux (manual compilation)
|
||||
git clone https://github.com/thu-ml/SageAttention
|
||||
cd SageAttention
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
|
||||
## Attention Modes
|
||||
|
||||
WanGP supports several attention implementations:
|
||||
@ -134,6 +81,12 @@ WanGP supports several attention implementations:
|
||||
- **Sage2**: 40% speed boost
|
||||
- **Flash**: Good performance, may be complex to install on Windows
|
||||
|
||||
### Attention GPU Compatibility
|
||||
|
||||
- RTX 10XX, 20XX: SDPA
|
||||
- RTX 30XX, 40XX: SDPA, Flash Attention, Xformers, Sage, Sage2
|
||||
- RTX 50XX: SDPA, Flash Attention, Xformers, Sage2
|
||||
|
||||
## Performance Profiles
|
||||
|
||||
Choose a profile based on your hardware:
|
||||
@ -161,10 +114,5 @@ If Sage attention doesn't work:
|
||||
- Use Profile 4 for lower VRAM usage
|
||||
- Consider using 1.3B models instead of 14B models
|
||||
|
||||
### GPU Compatibility
|
||||
|
||||
- RTX 10XX, 20XX: Supported with SDPA attention
|
||||
- RTX 30XX, 40XX: Full feature support
|
||||
- RTX 50XX: Beta support with PyTorch 2.7.0
|
||||
|
||||
For more troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md)
|
||||
For more troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md)
|
||||
|
||||
102
docs/LORAS.md
@ -7,18 +7,21 @@ Loras (Low-Rank Adaptations) allow you to customize video generation models by a
|
||||
Loras are organized in different folders based on the model they're designed for:
|
||||
|
||||
### Wan Text-to-Video Models
|
||||
- `loras/` - General t2v loras
|
||||
- `loras/` - General t2v loras for Wan 2.1 (t2v only) and for all Wan 2.2 models
|
||||
Optional sub folders:
|
||||
- `loras/1.3B/` - Loras specifically for 1.3B models
|
||||
- `loras/5B/` - Loras specifically for 5B models
|
||||
- `loras/14B/` - Loras specifically for 14B models
|
||||
|
||||
### Wan Image-to-Video Models
|
||||
- `loras_i2v/` - Image-to-video loras
|
||||
- `loras_i2v/` - Image-to-video loras for Wan 2.1
|
||||
|
||||
### Other Models
|
||||
- `loras_hunyuan/` - Hunyuan Video t2v loras
|
||||
- `loras_hunyuan_i2v/` - Hunyuan Video i2v loras
|
||||
- `loras_ltxv/` - LTX Video loras
|
||||
- `loras_flux/` - Flux loras
|
||||
- `loras_qwen/` - Qwen loras
|
||||
|
||||
## Custom Lora Directory
|
||||
|
||||
@ -40,7 +43,9 @@ python wgp.py --lora-dir-hunyuan /path/to/hunyuan/loras --lora-dir-ltxv /path/to
|
||||
2. Launch WanGP
|
||||
3. In the Advanced Tab, select the "Loras" section
|
||||
4. Check the loras you want to activate
|
||||
5. Set multipliers for each lora (default is 1.0)
|
||||
5. Set multipliers for each lora (default is 1.0 if multiplier is not mentioned)
|
||||
|
||||
If you store loras in the loras folder after WanGP has been launched, click the *Refresh* button at the top so that they become selectable.
|
||||
|
||||
### Lora Multipliers
|
||||
|
||||
@ -53,7 +58,7 @@ Multipliers control the strength of each lora's effect:
|
||||
- First lora: 1.2 strength
|
||||
- Second lora: 0.8 strength
|
||||
|
||||
#### Time-based Multipliers
|
||||
#### Time-based and Phase-based Multipliers
|
||||
For dynamic effects over generation steps, use comma-separated values:
|
||||
```
|
||||
0.9,0.8,0.7
|
||||
@ -63,6 +68,55 @@ For dynamic effects over generation steps, use comma-separated values:
|
||||
- First lora: 0.9 → 0.8 → 0.7
|
||||
- Second lora: 1.2 → 1.1 → 1.0
|
||||
|
||||
With models like Wan 2.2 that internally use two diffusion models (*High Noise* / *Low Noise*) you can specify which Loras you want applied during a specific phase by separating the phases with a ";".
|
||||
|
||||
For instance, if you want to disable a lora during the *High Noise* phase and enable it only during the *Low Noise* phase:
|
||||
```
|
||||
0;1
|
||||
```
|
||||
|
||||
Also with Wan 2.2, if you have two loras and you want the first one to be applied only during the High Noise phase and the second one only during the Low Noise phase:
|
||||
```
|
||||
1;0 0;1
|
||||
```
|
||||
|
||||
As usual, you can use any float for a multiplier and have a multiplier vary throughout one phase for one Lora:
|
||||
```
|
||||
0.9,0.8;1.2,1.1,1
|
||||
```
|
||||
In this example multiplier 0.9 and 0.8 will be used during the *High Noise* phase and 1.2, 1.1 and 1 during the *Low Noise* phase.
|
||||
|
||||
Here is another example for two loras:
|
||||
```
|
||||
0.9,0.8;1.2,1.1,1
|
||||
0.5;0,0.7
|
||||
```
|
||||
|
||||
If one or several of your Lora multipliers are phase based (that is, they contain a ";") and there are also Lora multipliers that are only time based (no ";" but a ","), the time-only multipliers will ignore the phases. For instance, let's assume we have a 6-step denoising process in the following example:
|
||||
|
||||
```
|
||||
1;0
|
||||
0;1
|
||||
0.8,0.7,0.5
|
||||
```
|
||||
Here, as expected, the first lora will only be used with the High Noise model and the second lora only with the Low Noise model. However for the third Lora: for steps 1-2 the multiplier will be (regardless of the phase) 0.8, then for steps 3-4 the multiplier will be 0.7 and finally for steps 5-6 the multiplier will be 0.5.
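To make this concrete, here is a worked table of the effective multiplier of each lora at every step of that 6-step example, assuming the switch from the High Noise to the Low Noise model happens after step 3 (in practice the exact switch step depends on the switch threshold):
```
step                      1     2     3     4     5     6
lora 1  "1;0"             1     1     1     0     0     0
lora 2  "0;1"             0     0     0     1     1     1
lora 3  "0.8,0.7,0.5"     0.8   0.8   0.7   0.7   0.5   0.5
```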
|
||||
|
||||
You can use phased Lora multipliers even if you have a single model (that is, without any High / Low models) as Lora multiplier phases are aligned with Guidance phases. Let's assume you have defined 3 guidance phases (for instance guidance=3, then guidance=1.5 and finally guidance=1):
|
||||
```
|
||||
0;1;0
|
||||
0;0;1
|
||||
```
|
||||
In that case no lora will be applied during the first phase when guidance is 3. Then the first lora will only be used when guidance is 1.5 and the second lora only when guidance is 1.
|
||||
|
||||
Best of all you can combine 3 guidance phases with High / Low models. Let's take this practical example with *Lightning 4/8 steps loras accelerators for Wan 2.2* where we want to increase the motion by adding some guidance at the very beginning (in that case a first phase that lasts only 1 step should be sufficient):
|
||||
```
|
||||
Guidances: 3.5, 1 and 1
|
||||
Model transition: Phase 2-3
|
||||
Loras Multipliers: 0;1;0 0;0;1
|
||||
```
|
||||
Here during the first phase with guidance 3.5, the High model will be used but there won't be any lora at all. Then during phase 2 only the High lora will be used (which requires setting the guidance to 1). Finally, in phase 3 WanGP will switch to the Low model and then only the Low lora will be used.
|
||||
|
||||
*Note that the syntax for multipliers can also be used in a Finetune model definition file (except that each multiplier definition is a string in a json list)*
|
||||
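For instance, the multipliers of the practical example above appear as follows in the *Wan2.2 Vace Lightning 3 Phases 14B* finetune definition (excerpt from defaults/vace_14B_lightning_3p_2_2.json, other properties omitted):
```
"loras": [
    "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan2.2-Lightning_T2V-v1.1-A14B-4steps-lora_HIGH_fp16.safetensors",
    "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan2.2-Lightning_T2V-v1.1-A14B-4steps-lora_LOW_fp16.safetensors"
],
"loras_multipliers": ["0;1;0", "0;0;1"]
```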
## Lora Presets
|
||||
|
||||
Lora Presets are combinations of loras with predefined multipliers and prompts.
|
||||
@ -100,15 +154,22 @@ WanGP supports multiple lora formats:
|
||||
## Loras Accelerators
|
||||
Most Loras are used to apply a specific style or to alter the content of the output of the generated video.
|
||||
However some Loras have been designed to transform a model into a distilled model which requires fewer steps to generate a video.
|
||||
Loras accelerators usually require setting the Guidance to 1. Don't forget to do it: otherwise not only will the quality of the generated video be bad, but generation will also be two times slower.
|
||||
|
||||
You will find most *Loras Accelerators* here:
|
||||
You will find most *Loras Accelerators* below:
|
||||
- Wan 2.1
|
||||
https://huggingface.co/DeepBeepMeep/Wan2.1/tree/main/loras_accelerators
|
||||
- Wan 2.2
|
||||
https://huggingface.co/DeepBeepMeep/Wan2.2/tree/main/loras_accelerators
|
||||
- Qwen:
|
||||
https://huggingface.co/DeepBeepMeep/Qwen_image/tree/main/loras_accelerators
|
||||
|
||||
|
||||
### Setup Instructions
|
||||
1. Download the Lora
|
||||
2. Place it in your `loras/` directory if it is a t2v lora or in the `loras_i2v/` directory if it is an i2v lora
|
||||
|
||||
## FusioniX (or FusionX) Lora
|
||||
## FusioniX (or FusionX) Lora for Wan 2.1 / Wan 2.2
|
||||
If you need just one Lora accelerator, use this one. It is a combination of multiple Lora accelerators (including CausVid below) and style loras. It will not only accelerate the video generation but it will also improve the quality. There are two versions of this lora depending on whether you use it for t2v or i2v
|
||||
|
||||
### Usage
|
||||
@ -123,8 +184,8 @@ If you need just one Lora accelerator use this one. It is a combination of multi
|
||||
5. Set generation steps from 8-10
|
||||
6. Generate!
|
||||
|
||||
## Safe-Forcing lightx2v Lora (Video Generation Accelerator)
|
||||
Safeforcing Lora has been created by Kijai from the Safe-Forcing lightx2v distilled Wan model and can generate videos with only 2 steps and offers also a 2x speed improvement since it doesnt require classifier free guidance. It works on both t2v and i2v models
|
||||
## Self-Forcing lightx2v Lora (Video Generation Accelerator) for Wan 2.1 / Wan 2.2
|
||||
The Self-Forcing Lora has been created by Kijai from the Self-Forcing lightx2v distilled Wan model and can generate videos with only 2 steps; it also offers a 2x speed improvement since it doesn't require classifier free guidance. It works on both t2v and i2v models
|
||||
You will find it under the name of *Wan21_T2V_14B_lightx2v_cfg_step_distill_lora_rank32.safetensors*
|
||||
|
||||
### Usage
|
||||
@ -140,7 +201,7 @@ You will find it under the name of *Wan21_T2V_14B_lightx2v_cfg_step_distill_lora
|
||||
6. Generate!
|
||||
|
||||
|
||||
## CausVid Lora (Video Generation Accelerator)
|
||||
## CausVid Lora (Video Generation Accelerator) for Wan 2.1 / Wan 2.2
|
||||
CausVid is a distilled Wan model that generates videos in 4-12 steps with 2x speed improvement.
|
||||
|
||||
### Usage
|
||||
@ -163,11 +224,10 @@ CausVid is a distilled Wan model that generates videos in 4-12 steps with 2x spe
|
||||
*Note: Lower steps = lower quality (especially motion)*
|
||||
|
||||
|
||||
## AccVid Lora (Video Generation Accelerator)
|
||||
## AccVid Lora (Video Generation Accelerator) for Wan 2.1 / Wan 2.2
|
||||
|
||||
AccVid is a distilled Wan model that generates videos with a 2x speed improvement since classifier free guidance is no longer needed (that is cfg = 1).
|
||||
|
||||
|
||||
### Usage
|
||||
1. Select a Wan t2v model (e.g., Wan 2.1 text2video 13B or Vace 13B) or Wan i2v model
|
||||
2. Enable Advanced Mode
|
||||
@ -176,6 +236,21 @@ AccVid is a distilled Wan model that generates videos with a 2x speed improvemen
|
||||
- Set Shift Scale = 5
|
||||
4. The number of steps remains unchanged compared to what you would use with the original model, but generation will be two times faster since classifier free guidance is not needed
|
||||
|
||||
## Lightx2v 4 steps Lora (Video Generation Accelerator) for Wan 2.2
|
||||
This lora is in fact composed of two loras, one for the High Noise model and one for the Low Noise Wan 2.2 model.
|
||||
|
||||
You need to select these two loras and set the following Loras multipliers:
|
||||
|
||||
```
|
||||
1;0 0;1 (the High lora should only be enabled when the High model is loaded, same for the Low lora)
|
||||
```
|
||||
|
||||
Don't forget to set guidance to 1 !
|
||||
## Qwen Image Lightning 4 steps / Lightning 8 steps
|
||||
Very powerful lora that you can use to reduce the number of steps from 30 to only 4 !
|
||||
Just install the lora in the *loras_qwen* folder, select the lora and set Guidance to 1 and the number of steps to 4 or 8
|
||||
|
||||
|
||||
|
||||
https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_T2V_14B_lightx2v_cfg_step_distill_lora_rank32.safetensors
|
||||
|
||||
@ -190,6 +265,7 @@ https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_T2V_14B_lightx2v_cfg
|
||||
- Loras are loaded on-demand to save VRAM
|
||||
- Multiple loras can be used simultaneously
|
||||
- Time-based multipliers don't use extra memory
|
||||
- The order of Loras doesn't matter (as long as the loras multipliers are in the right order of course !)
|
||||
|
||||
## Finding Loras
|
||||
|
||||
@ -241,6 +317,7 @@ In the video, a man is presented. The man is in a city and looks at his watch.
|
||||
## Troubleshooting
|
||||
|
||||
### Lora Not Working
|
||||
0. If it is a lora accelerator, Guidance should be set to 1
|
||||
1. Check if lora is compatible with your model size (1.3B vs 14B)
|
||||
2. Verify lora format is supported
|
||||
3. Try different multiplier values
|
||||
@ -262,12 +339,13 @@ In the video, a man is presented. The man is in a city and looks at his watch.
|
||||
|
||||
```bash
|
||||
# Lora-related command line options
|
||||
--lora-dir path # Path to t2v loras directory
|
||||
--lora-dir path # Path to t2v loras directory
|
||||
--lora-dir-i2v path # Path to i2v loras directory
|
||||
--lora-dir-hunyuan path # Path to Hunyuan t2v loras
|
||||
--lora-dir-hunyuan-i2v path # Path to Hunyuan i2v loras
|
||||
--lora-dir-ltxv path # Path to LTX Video loras
|
||||
--lora-dir-flux path # Path to Flux loras
|
||||
--lora-dir-qwen path # Path to Qwen loras
|
||||
--lora-preset preset # Load preset on startup
|
||||
--check-loras # Filter incompatible loras
|
||||
```
|
||||
BIN
favicon.png
Normal file
|
After Width: | Height: | Size: 16 KiB |
@ -1,13 +0,0 @@
|
||||
try:
|
||||
from ._version import (
|
||||
version as __version__, # type: ignore
|
||||
version_tuple,
|
||||
)
|
||||
except ImportError:
|
||||
__version__ = "unknown (no version information available)"
|
||||
version_tuple = (0, 0, "unknown", "noinfo")
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
PACKAGE = __package__.replace("_", "-")
|
||||
PACKAGE_ROOT = Path(__file__).parent
|
||||
@ -1,109 +0,0 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from glob import iglob
|
||||
from mmgp import offload as offload
|
||||
import torch
|
||||
from wan.utils.utils import calculate_new_dimensions
|
||||
from flux.sampling import denoise, get_schedule, prepare_kontext, unpack
|
||||
from flux.modules.layers import get_linear_split_map
|
||||
from flux.util import (
|
||||
aspect_ratio_to_height_width,
|
||||
load_ae,
|
||||
load_clip,
|
||||
load_flow_model,
|
||||
load_t5,
|
||||
save_image,
|
||||
)
|
||||
|
||||
class model_factory:
|
||||
def __init__(
|
||||
self,
|
||||
checkpoint_dir,
|
||||
model_filename = None,
|
||||
model_type = None,
|
||||
base_model_type = None,
|
||||
text_encoder_filename = None,
|
||||
quantizeTransformer = False,
|
||||
save_quantized = False,
|
||||
dtype = torch.bfloat16,
|
||||
VAE_dtype = torch.float32,
|
||||
mixed_precision_transformer = False
|
||||
):
|
||||
self.device = torch.device(f"cuda")
|
||||
self.VAE_dtype = VAE_dtype
|
||||
self.dtype = dtype
|
||||
torch_device = "cpu"
|
||||
|
||||
self.t5 = load_t5(torch_device, text_encoder_filename, max_length=512)
|
||||
self.clip = load_clip(torch_device)
|
||||
self.name= "flux-dev-kontext"
|
||||
self.model = load_flow_model(self.name, model_filename[0], torch_device)
|
||||
|
||||
self.vae = load_ae(self.name, device=torch_device)
|
||||
|
||||
# offload.change_dtype(self.model, dtype, True)
|
||||
if save_quantized:
|
||||
from wgp import save_quantized_model
|
||||
save_quantized_model(self.model, model_type, model_filename[0], dtype, None)
|
||||
|
||||
split_linear_modules_map = get_linear_split_map()
|
||||
self.model.split_linear_modules_map = split_linear_modules_map
|
||||
offload.split_linear_modules(self.model, split_linear_modules_map )
|
||||
|
||||
|
||||
def generate(
|
||||
self,
|
||||
seed: int | None = None,
|
||||
input_prompt: str = "replace the logo with the text 'Black Forest Labs'",
|
||||
sampling_steps: int = 20,
|
||||
input_ref_images = None,
|
||||
width= 832,
|
||||
height=480,
|
||||
guide_scale: float = 2.5,
|
||||
fit_into_canvas = None,
|
||||
callback = None,
|
||||
loras_slists = None,
|
||||
batch_size = 1,
|
||||
**bbargs
|
||||
):
|
||||
|
||||
if self._interrupt:
|
||||
return None
|
||||
|
||||
device="cuda"
|
||||
if input_ref_images != None and len(input_ref_images) > 0:
|
||||
image_ref = input_ref_images[0]
|
||||
w, h = image_ref.size
|
||||
height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
|
||||
|
||||
inp, height, width = prepare_kontext(
|
||||
t5=self.t5,
|
||||
clip=self.clip,
|
||||
prompt=input_prompt,
|
||||
ae=self.vae,
|
||||
img_cond=image_ref,
|
||||
target_width=width,
|
||||
target_height=height,
|
||||
bs=batch_size,
|
||||
seed=seed,
|
||||
device=device,
|
||||
)
|
||||
|
||||
inp.pop("img_cond_orig")
|
||||
timesteps = get_schedule(sampling_steps, inp["img"].shape[1], shift=(self.name != "flux-schnell"))
|
||||
def unpack_latent(x):
|
||||
return unpack(x.float(), height, width)
|
||||
# denoise initial noise
|
||||
x = denoise(self.model, **inp, timesteps=timesteps, guidance=guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent)
|
||||
if x==None: return None
|
||||
# decode latents to pixel space
|
||||
x = unpack_latent(x)
|
||||
with torch.autocast(device_type=device, dtype=torch.bfloat16):
|
||||
x = self.vae.decode(x)
|
||||
|
||||
x = x.clamp(-1, 1)
|
||||
x = x.transpose(0, 1)
|
||||
return x
|
||||
|
||||
168
flux/model.py
@ -1,168 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
from torch import Tensor, nn
|
||||
|
||||
from flux.modules.layers import (
|
||||
DoubleStreamBlock,
|
||||
EmbedND,
|
||||
LastLayer,
|
||||
MLPEmbedder,
|
||||
SingleStreamBlock,
|
||||
timestep_embedding,
|
||||
)
|
||||
from flux.modules.lora import LinearLora, replace_linear_with_lora
|
||||
|
||||
|
||||
@dataclass
|
||||
class FluxParams:
|
||||
in_channels: int
|
||||
out_channels: int
|
||||
vec_in_dim: int
|
||||
context_in_dim: int
|
||||
hidden_size: int
|
||||
mlp_ratio: float
|
||||
num_heads: int
|
||||
depth: int
|
||||
depth_single_blocks: int
|
||||
axes_dim: list[int]
|
||||
theta: int
|
||||
qkv_bias: bool
|
||||
guidance_embed: bool
|
||||
|
||||
|
||||
class Flux(nn.Module):
|
||||
"""
|
||||
Transformer model for flow matching on sequences.
|
||||
"""
|
||||
|
||||
def __init__(self, params: FluxParams):
|
||||
super().__init__()
|
||||
|
||||
self.params = params
|
||||
self.in_channels = params.in_channels
|
||||
self.out_channels = params.out_channels
|
||||
if params.hidden_size % params.num_heads != 0:
|
||||
raise ValueError(
|
||||
f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
|
||||
)
|
||||
pe_dim = params.hidden_size // params.num_heads
|
||||
if sum(params.axes_dim) != pe_dim:
|
||||
raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
|
||||
self.hidden_size = params.hidden_size
|
||||
self.num_heads = params.num_heads
|
||||
self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
|
||||
self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
|
||||
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
|
||||
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
|
||||
self.guidance_in = (
|
||||
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
|
||||
)
|
||||
self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
|
||||
|
||||
self.double_blocks = nn.ModuleList(
|
||||
[
|
||||
DoubleStreamBlock(
|
||||
self.hidden_size,
|
||||
self.num_heads,
|
||||
mlp_ratio=params.mlp_ratio,
|
||||
qkv_bias=params.qkv_bias,
|
||||
)
|
||||
for _ in range(params.depth)
|
||||
]
|
||||
)
|
||||
|
||||
self.single_blocks = nn.ModuleList(
|
||||
[
|
||||
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
|
||||
for _ in range(params.depth_single_blocks)
|
||||
]
|
||||
)
|
||||
|
||||
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
|
||||
|
||||
def preprocess_loras(self, model_type, sd):
|
||||
new_sd = {}
|
||||
if len(sd) == 0: return sd
|
||||
|
||||
first_key= next(iter(sd))
|
||||
if first_key.startswith("transformer."):
|
||||
src_list = [".attn.to_q.", ".attn.to_k.", ".attn.to_v."]
|
||||
tgt_list = [".linear1_attn_q.", ".linear1_attn_k.", ".linear1_attn_v."]
|
||||
for k,v in sd.items():
|
||||
k = k.replace("transformer.single_transformer_blocks", "diffusion_model.single_blocks")
|
||||
k = k.replace("transformer.double_transformer_blocks", "diffusion_model.double_blocks")
|
||||
for src, tgt in zip(src_list, tgt_list):
|
||||
k = k.replace(src, tgt)
|
||||
|
||||
new_sd[k] = v
|
||||
|
||||
return new_sd
|
||||
|
||||
def forward(
|
||||
self,
|
||||
img: Tensor,
|
||||
img_ids: Tensor,
|
||||
txt: Tensor,
|
||||
txt_ids: Tensor,
|
||||
timesteps: Tensor,
|
||||
y: Tensor,
|
||||
guidance: Tensor | None = None,
|
||||
callback= None,
|
||||
pipeline =None,
|
||||
|
||||
) -> Tensor:
|
||||
if img.ndim != 3 or txt.ndim != 3:
|
||||
raise ValueError("Input img and txt tensors must have 3 dimensions.")
|
||||
|
||||
# running on sequences img
|
||||
img = self.img_in(img)
|
||||
vec = self.time_in(timestep_embedding(timesteps, 256))
|
||||
if self.params.guidance_embed:
|
||||
if guidance is None:
|
||||
raise ValueError("Didn't get guidance strength for guidance distilled model.")
|
||||
vec += self.guidance_in(timestep_embedding(guidance, 256))
|
||||
vec += self.vector_in(y)
|
||||
txt = self.txt_in(txt)
|
||||
|
||||
ids = torch.cat((txt_ids, img_ids), dim=1)
|
||||
pe = self.pe_embedder(ids)
|
||||
|
||||
for block in self.double_blocks:
|
||||
if callback != None:
|
||||
callback(-1, None, False, True)
|
||||
if pipeline._interrupt:
|
||||
return None
|
||||
img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
|
||||
|
||||
img = torch.cat((txt, img), 1)
|
||||
for block in self.single_blocks:
|
||||
img = block(img, vec=vec, pe=pe)
|
||||
img = img[:, txt.shape[1] :, ...]
|
||||
|
||||
img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
|
||||
return img
|
||||
|
||||
|
||||
class FluxLoraWrapper(Flux):
|
||||
def __init__(
|
||||
self,
|
||||
lora_rank: int = 128,
|
||||
lora_scale: float = 1.0,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
self.lora_rank = lora_rank
|
||||
|
||||
replace_linear_with_lora(
|
||||
self,
|
||||
max_rank=lora_rank,
|
||||
scale=lora_scale,
|
||||
)
|
||||
|
||||
def set_lora_scale(self, scale: float) -> None:
|
||||
for module in self.modules():
|
||||
if isinstance(module, LinearLora):
|
||||
module.set_scale(scale=scale)
|
||||
392
flux/sampling.py
@ -1,392 +0,0 @@
|
||||
import math
|
||||
from typing import Callable
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from einops import rearrange, repeat
|
||||
from PIL import Image
|
||||
from torch import Tensor
|
||||
|
||||
from .model import Flux
|
||||
from .modules.autoencoder import AutoEncoder
|
||||
from .modules.conditioner import HFEmbedder
|
||||
from .modules.image_embedders import CannyImageEncoder, DepthImageEncoder, ReduxImageEncoder
|
||||
from .util import PREFERED_KONTEXT_RESOLUTIONS
|
||||
from einops import rearrange, repeat
|
||||
|
||||
|
||||
def get_noise(
|
||||
num_samples: int,
|
||||
height: int,
|
||||
width: int,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
):
|
||||
return torch.randn(
|
||||
num_samples,
|
||||
16,
|
||||
# allow for packing
|
||||
2 * math.ceil(height / 16),
|
||||
2 * math.ceil(width / 16),
|
||||
dtype=dtype,
|
||||
generator=torch.Generator(device=device).manual_seed(seed),
|
||||
)
|
||||
|
||||
|
||||
def prepare(t5: HFEmbedder, clip: HFEmbedder, img: Tensor, prompt: str | list[str]) -> dict[str, Tensor]:
|
||||
bs, c, h, w = img.shape
|
||||
if bs == 1 and not isinstance(prompt, str):
|
||||
bs = len(prompt)
|
||||
|
||||
img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
|
||||
if img.shape[0] == 1 and bs > 1:
|
||||
img = repeat(img, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
img_ids = torch.zeros(h // 2, w // 2, 3)
|
||||
img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
|
||||
img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
|
||||
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
|
||||
|
||||
if isinstance(prompt, str):
|
||||
prompt = [prompt]
|
||||
txt = t5(prompt)
|
||||
if txt.shape[0] == 1 and bs > 1:
|
||||
txt = repeat(txt, "1 ... -> bs ...", bs=bs)
|
||||
txt_ids = torch.zeros(bs, txt.shape[1], 3)
|
||||
|
||||
vec = clip(prompt)
|
||||
if vec.shape[0] == 1 and bs > 1:
|
||||
vec = repeat(vec, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
return {
|
||||
"img": img,
|
||||
"img_ids": img_ids.to(img.device),
|
||||
"txt": txt.to(img.device),
|
||||
"txt_ids": txt_ids.to(img.device),
|
||||
"vec": vec.to(img.device),
|
||||
}
|
||||
|
||||
|
||||
def prepare_control(
|
||||
t5: HFEmbedder,
|
||||
clip: HFEmbedder,
|
||||
img: Tensor,
|
||||
prompt: str | list[str],
|
||||
ae: AutoEncoder,
|
||||
encoder: DepthImageEncoder | CannyImageEncoder,
|
||||
img_cond_path: str,
|
||||
) -> dict[str, Tensor]:
|
||||
# load and encode the conditioning image
|
||||
bs, _, h, w = img.shape
|
||||
if bs == 1 and not isinstance(prompt, str):
|
||||
bs = len(prompt)
|
||||
|
||||
img_cond = Image.open(img_cond_path).convert("RGB")
|
||||
|
||||
width = w * 8
|
||||
height = h * 8
|
||||
img_cond = img_cond.resize((width, height), Image.Resampling.LANCZOS)
|
||||
img_cond = np.array(img_cond)
|
||||
img_cond = torch.from_numpy(img_cond).float() / 127.5 - 1.0
|
||||
img_cond = rearrange(img_cond, "h w c -> 1 c h w")
|
||||
|
||||
with torch.no_grad():
|
||||
img_cond = encoder(img_cond)
|
||||
img_cond = ae.encode(img_cond)
|
||||
|
||||
img_cond = img_cond.to(torch.bfloat16)
|
||||
img_cond = rearrange(img_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
|
||||
if img_cond.shape[0] == 1 and bs > 1:
|
||||
img_cond = repeat(img_cond, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
return_dict = prepare(t5, clip, img, prompt)
|
||||
return_dict["img_cond"] = img_cond
|
||||
return return_dict
|
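# --- Added annotation (hedged, not part of the original file) ---
# prepare_control() runs the structural encoder (Canny or Depth) in pixel space,
# encodes the result with the AE and packs it into the same token layout as the
# base latent, so it can be concatenated channel-wise during denoising.
#   inp = prepare_control(t5, clip, x, prompt="a robot", ae=ae,
#                         encoder=CannyImageEncoder(torch.device("cuda")),
#                         img_cond_path="robot.png")        # path is illustrative
#   inp["img_cond"].shape == inp["img"].shape               # same tokens, same dim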
||||
|
||||
|
||||
def prepare_fill(
|
||||
t5: HFEmbedder,
|
||||
clip: HFEmbedder,
|
||||
img: Tensor,
|
||||
prompt: str | list[str],
|
||||
ae: AutoEncoder,
|
||||
img_cond_path: str,
|
||||
mask_path: str,
|
||||
) -> dict[str, Tensor]:
|
||||
# load and encode the conditioning image and the mask
|
||||
bs, _, _, _ = img.shape
|
||||
if bs == 1 and not isinstance(prompt, str):
|
||||
bs = len(prompt)
|
||||
|
||||
img_cond = Image.open(img_cond_path).convert("RGB")
|
||||
img_cond = np.array(img_cond)
|
||||
img_cond = torch.from_numpy(img_cond).float() / 127.5 - 1.0
|
||||
img_cond = rearrange(img_cond, "h w c -> 1 c h w")
|
||||
|
||||
mask = Image.open(mask_path).convert("L")
|
||||
mask = np.array(mask)
|
||||
mask = torch.from_numpy(mask).float() / 255.0
|
||||
mask = rearrange(mask, "h w -> 1 1 h w")
|
||||
|
||||
with torch.no_grad():
|
||||
img_cond = img_cond.to(img.device)
|
||||
mask = mask.to(img.device)
|
||||
img_cond = img_cond * (1 - mask)
|
||||
img_cond = ae.encode(img_cond)
|
||||
mask = mask[:, 0, :, :]
|
||||
mask = mask.to(torch.bfloat16)
|
||||
mask = rearrange(
|
||||
mask,
|
||||
"b (h ph) (w pw) -> b (ph pw) h w",
|
||||
ph=8,
|
||||
pw=8,
|
||||
)
|
||||
mask = rearrange(mask, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
|
||||
if mask.shape[0] == 1 and bs > 1:
|
||||
mask = repeat(mask, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
img_cond = img_cond.to(torch.bfloat16)
|
||||
img_cond = rearrange(img_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
|
||||
if img_cond.shape[0] == 1 and bs > 1:
|
||||
img_cond = repeat(img_cond, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
img_cond = torch.cat((img_cond, mask), dim=-1)
|
||||
|
||||
return_dict = prepare(t5, clip, img, prompt)
|
||||
return_dict["img_cond"] = img_cond.to(img.device)
|
||||
return return_dict
|
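# --- Added annotation (hedged, not part of the original file) ---
# Channel budget of the fill conditioning built above: the mask is pixel-shuffled
# 8x8 onto the latent grid (64 channels), packed 2x2 into tokens of 64 * 4 = 256
# features, and concatenated with the packed masked-image latent (16 * 4 = 64),
# so every token of inp["img_cond"] carries 64 + 256 = 320 features.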
||||
|
||||
|
||||
def prepare_redux(
|
||||
t5: HFEmbedder,
|
||||
clip: HFEmbedder,
|
||||
img: Tensor,
|
||||
prompt: str | list[str],
|
||||
encoder: ReduxImageEncoder,
|
||||
img_cond_path: str,
|
||||
) -> dict[str, Tensor]:
|
||||
bs, _, h, w = img.shape
|
||||
if bs == 1 and not isinstance(prompt, str):
|
||||
bs = len(prompt)
|
||||
|
||||
img_cond = Image.open(img_cond_path).convert("RGB")
|
||||
with torch.no_grad():
|
||||
img_cond = encoder(img_cond)
|
||||
|
||||
img_cond = img_cond.to(torch.bfloat16)
|
||||
if img_cond.shape[0] == 1 and bs > 1:
|
||||
img_cond = repeat(img_cond, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
|
||||
if img.shape[0] == 1 and bs > 1:
|
||||
img = repeat(img, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
img_ids = torch.zeros(h // 2, w // 2, 3)
|
||||
img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
|
||||
img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
|
||||
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
|
||||
|
||||
if isinstance(prompt, str):
|
||||
prompt = [prompt]
|
||||
txt = t5(prompt)
|
||||
txt = torch.cat((txt, img_cond.to(txt)), dim=-2)
|
||||
if txt.shape[0] == 1 and bs > 1:
|
||||
txt = repeat(txt, "1 ... -> bs ...", bs=bs)
|
||||
txt_ids = torch.zeros(bs, txt.shape[1], 3)
|
||||
|
||||
vec = clip(prompt)
|
||||
if vec.shape[0] == 1 and bs > 1:
|
||||
vec = repeat(vec, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
return {
|
||||
"img": img,
|
||||
"img_ids": img_ids.to(img.device),
|
||||
"txt": txt.to(img.device),
|
||||
"txt_ids": txt_ids.to(img.device),
|
||||
"vec": vec.to(img.device),
|
||||
}
|
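# --- Added annotation (hedged, not part of the original file) ---
# prepare_redux() appends the Redux image embedding to the T5 sequence (dim=-2),
# so the text stream simply grows by the number of Redux tokens; txt_ids stays
# all-zero because these extra tokens carry no spatial position.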
||||
|
||||
|
||||
def prepare_kontext(
|
||||
t5: HFEmbedder,
|
||||
clip: HFEmbedder,
|
||||
prompt: str | list[str],
|
||||
ae: AutoEncoder,
|
||||
img_cond: str,
|
||||
seed: int,
|
||||
device: torch.device,
|
||||
target_width: int | None = None,
|
||||
target_height: int | None = None,
|
||||
bs: int = 1,
|
||||
) -> tuple[dict[str, Tensor], int, int]:
|
||||
# load and encode the conditioning image
|
||||
if bs == 1 and not isinstance(prompt, str):
|
||||
bs = len(prompt)
|
||||
|
||||
width, height = img_cond.size
|
||||
aspect_ratio = width / height
|
||||
|
||||
# Kontext is trained on specific resolutions, using one of them is recommended
|
||||
_, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS)
|
||||
|
||||
width = 2 * int(width / 16)
|
||||
height = 2 * int(height / 16)
|
||||
|
||||
img_cond = img_cond.resize((8 * width, 8 * height), Image.Resampling.LANCZOS)
|
||||
img_cond = np.array(img_cond)
|
||||
img_cond = torch.from_numpy(img_cond).float() / 127.5 - 1.0
|
||||
img_cond = rearrange(img_cond, "h w c -> 1 c h w")
|
||||
img_cond_orig = img_cond.clone()
|
||||
|
||||
with torch.no_grad():
|
||||
img_cond = ae.encode(img_cond.to(device))
|
||||
|
||||
img_cond = img_cond.to(torch.bfloat16)
|
||||
img_cond = rearrange(img_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
|
||||
if img_cond.shape[0] == 1 and bs > 1:
|
||||
img_cond = repeat(img_cond, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
# image ids are the same as base image with the first dimension set to 1
|
||||
# instead of 0
|
||||
img_cond_ids = torch.zeros(height // 2, width // 2, 3)
|
||||
img_cond_ids[..., 0] = 1
|
||||
img_cond_ids[..., 1] = img_cond_ids[..., 1] + torch.arange(height // 2)[:, None]
|
||||
img_cond_ids[..., 2] = img_cond_ids[..., 2] + torch.arange(width // 2)[None, :]
|
||||
img_cond_ids = repeat(img_cond_ids, "h w c -> b (h w) c", b=bs)
|
||||
|
||||
if target_width is None:
|
||||
target_width = 8 * width
|
||||
if target_height is None:
|
||||
target_height = 8 * height
|
||||
|
||||
img = get_noise(
|
||||
bs,
|
||||
target_height,
|
||||
target_width,
|
||||
device=device,
|
||||
dtype=torch.bfloat16,
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
return_dict = prepare(t5, clip, img, prompt)
|
||||
return_dict["img_cond_seq"] = img_cond
|
||||
return_dict["img_cond_seq_ids"] = img_cond_ids.to(device)
|
||||
return_dict["img_cond_orig"] = img_cond_orig
|
||||
return return_dict, target_height, target_width
|
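# --- Added annotation (hedged, not part of the original file) ---
# Resolution snapping above: the closest PREFERED_KONTEXT_RESOLUTIONS entry by
# aspect ratio is converted to latent-token units. For an illustrative match of
# 1184x880: width = 2 * (1184 // 16) = 148, height = 2 * (880 // 16) = 110, and
# the conditioning image is resized to (8 * width, 8 * height) = (1184, 880)
# before being encoded and packed like the base latent.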
||||
|
||||
|
||||
def time_shift(mu: float, sigma: float, t: Tensor):
|
||||
return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
|
||||
|
||||
|
||||
def get_lin_function(
|
||||
x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
|
||||
) -> Callable[[float], float]:
|
||||
m = (y2 - y1) / (x2 - x1)
|
||||
b = y1 - m * x1
|
||||
return lambda x: m * x + b
|
||||
|
||||
|
||||
def get_schedule(
|
||||
num_steps: int,
|
||||
image_seq_len: int,
|
||||
base_shift: float = 0.5,
|
||||
max_shift: float = 1.15,
|
||||
shift: bool = True,
|
||||
) -> list[float]:
|
||||
# extra step for zero
|
||||
timesteps = torch.linspace(1, 0, num_steps + 1)
|
||||
|
||||
# shifting the schedule to favor high timesteps for higher signal images
|
||||
if shift:
|
||||
# estimate mu based on linear estimation between two points
|
||||
mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
|
||||
timesteps = time_shift(mu, 1.0, timesteps)
|
||||
|
||||
return timesteps.tolist()
|
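# --- Added annotation (hedged, not part of the original file) ---
# Worked example of the schedule shift: get_lin_function() interpolates between
# (256, 0.5) and (4096, 1.15), so a 4096-token (roughly 1MP) image gets mu = 1.15.
# Each linearly spaced t is then warped by
#   time_shift(mu, 1.0, t) = exp(mu) / (exp(mu) + (1 / t - 1) ** 1.0)
# e.g. t = 0.5 becomes exp(1.15) / (exp(1.15) + 1) ≈ 0.76, i.e. the schedule
# spends more of its steps at high noise levels for larger images.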
||||
|
||||
|
||||
def denoise(
|
||||
model: Flux,
|
||||
# model input
|
||||
img: Tensor,
|
||||
img_ids: Tensor,
|
||||
txt: Tensor,
|
||||
txt_ids: Tensor,
|
||||
vec: Tensor,
|
||||
# sampling parameters
|
||||
timesteps: list[float],
|
||||
guidance: float = 4.0,
|
||||
# extra img tokens (channel-wise)
|
||||
img_cond: Tensor | None = None,
|
||||
# extra img tokens (sequence-wise)
|
||||
img_cond_seq: Tensor | None = None,
|
||||
img_cond_seq_ids: Tensor | None = None,
|
||||
callback=None,
|
||||
pipeline=None,
|
||||
loras_slists=None,
|
||||
unpack_latent=None,
|
||||
):
|
||||
|
||||
kwargs = {'pipeline': pipeline, 'callback': callback}
|
||||
if callback is not None:
|
||||
callback(-1, None, True)
|
||||
|
||||
updated_num_steps = len(timesteps) - 1
|
||||
if callback is not None:
|
||||
from wgp import update_loras_slists
|
||||
update_loras_slists(model, loras_slists, updated_num_steps)
|
||||
callback(-1, None, True, override_num_inference_steps = updated_num_steps)
|
||||
from mmgp import offload
|
||||
# this is ignored for schnell
|
||||
guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
|
||||
for i, (t_curr, t_prev) in enumerate(zip(timesteps[:-1], timesteps[1:])):
|
||||
offload.set_step_no_for_lora(model, i)
|
||||
if pipeline._interrupt:
|
||||
return None
|
||||
|
||||
t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
|
||||
img_input = img
|
||||
img_input_ids = img_ids
|
||||
if img_cond is not None:
|
||||
img_input = torch.cat((img, img_cond), dim=-1)
|
||||
if img_cond_seq is not None:
|
||||
assert (
|
||||
img_cond_seq_ids is not None
|
||||
), "You need to provide either both or neither of the sequence conditioning"
|
||||
img_input = torch.cat((img_input, img_cond_seq), dim=1)
|
||||
img_input_ids = torch.cat((img_input_ids, img_cond_seq_ids), dim=1)
|
||||
pred = model(
|
||||
img=img_input,
|
||||
img_ids=img_input_ids,
|
||||
txt=txt,
|
||||
txt_ids=txt_ids,
|
||||
y=vec,
|
||||
timesteps=t_vec,
|
||||
guidance=guidance_vec,
|
||||
**kwargs
|
||||
)
|
||||
if pred is None: return None
|
||||
|
||||
if img_input_ids is not None:
|
||||
pred = pred[:, : img.shape[1]]
|
||||
|
||||
img += (t_prev - t_curr) * pred
|
||||
if callback is not None:
|
||||
preview = unpack_latent(img).transpose(0,1)
|
||||
callback(i, preview, False)
|
||||
|
||||
|
||||
return img
|
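# --- Added annotation (hedged, not part of the original file) ---
# The update `img += (t_prev - t_curr) * pred` is a plain Euler step on the
# rectified-flow ODE: the model predicts a velocity, t decreases from 1 toward 0,
# so (t_prev - t_curr) is negative and the loop walks the noise back to the data
# distribution in len(timesteps) - 1 steps, with optional channel-wise (img_cond)
# and sequence-wise (img_cond_seq) conditioning tokens stripped off the output.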
||||
|
||||
|
||||
def unpack(x: Tensor, height: int, width: int) -> Tensor:
|
||||
return rearrange(
|
||||
x,
|
||||
"b (h w) (c ph pw) -> b c (h ph) (w pw)",
|
||||
h=math.ceil(height / 16),
|
||||
w=math.ceil(width / 16),
|
||||
ph=2,
|
||||
pw=2,
|
||||
)
|
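# --- Added annotation (hedged, not part of the original file) ---
# unpack() inverts the 2x2 packing done in prepare():
#   x = torch.randn(1, 4096, 64)              # 4096 tokens of a 1024x1024 sample
#   lat = unpack(x, height=1024, width=1024)
#   lat.shape  ->  (1, 16, 128, 128)          # ready for ae.decode()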
||||
@ -1,302 +0,0 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from glob import iglob
|
||||
|
||||
import torch
|
||||
from fire import Fire
|
||||
from transformers import pipeline
|
||||
|
||||
from flux.sampling import denoise, get_noise, get_schedule, prepare, unpack
|
||||
from flux.util import (
|
||||
check_onnx_access_for_trt,
|
||||
configs,
|
||||
load_ae,
|
||||
load_clip,
|
||||
load_flow_model,
|
||||
load_t5,
|
||||
save_image,
|
||||
)
|
||||
|
||||
NSFW_THRESHOLD = 0.85
|
||||
|
||||
|
||||
@dataclass
|
||||
class SamplingOptions:
|
||||
prompt: str
|
||||
width: int
|
||||
height: int
|
||||
num_steps: int
|
||||
guidance: float
|
||||
seed: int | None
|
||||
|
||||
|
||||
def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
|
||||
user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Either write your prompt directly, leave this field empty "
|
||||
"to repeat the prompt or write a command starting with a slash:\n"
|
||||
"- '/w <width>' will set the width of the generated image\n"
|
||||
"- '/h <height>' will set the height of the generated image\n"
|
||||
"- '/s <seed>' sets the next seed\n"
|
||||
"- '/g <guidance>' sets the guidance (flux-dev only)\n"
|
||||
"- '/n <steps>' sets the number of steps\n"
|
||||
"- '/q' to quit"
|
||||
)
|
||||
|
||||
while (prompt := input(user_question)).startswith("/"):
|
||||
if prompt.startswith("/w"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, width = prompt.split()
|
||||
options.width = 16 * (int(width) // 16)
|
||||
print(
|
||||
f"Setting resolution to {options.width} x {options.height} "
|
||||
f"({options.height * options.width / 1e6:.2f}MP)"
|
||||
)
|
||||
elif prompt.startswith("/h"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, height = prompt.split()
|
||||
options.height = 16 * (int(height) // 16)
|
||||
print(
|
||||
f"Setting resolution to {options.width} x {options.height} "
|
||||
f"({options.height * options.width / 1e6:.2f}MP)"
|
||||
)
|
||||
elif prompt.startswith("/g"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, guidance = prompt.split()
|
||||
options.guidance = float(guidance)
|
||||
print(f"Setting guidance to {options.guidance}")
|
||||
elif prompt.startswith("/s"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, seed = prompt.split()
|
||||
options.seed = int(seed)
|
||||
print(f"Setting seed to {options.seed}")
|
||||
elif prompt.startswith("/n"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, steps = prompt.split()
|
||||
options.num_steps = int(steps)
|
||||
print(f"Setting number of steps to {options.num_steps}")
|
||||
elif prompt.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None
|
||||
else:
|
||||
if not prompt.startswith("/h"):
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
print(usage)
|
||||
if prompt != "":
|
||||
options.prompt = prompt
|
||||
return options
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def main(
|
||||
name: str = "flux-schnell",
|
||||
width: int = 1360,
|
||||
height: int = 768,
|
||||
seed: int | None = None,
|
||||
prompt: str = (
|
||||
"a photo of a forest with mist swirling around the tree trunks. The word "
|
||||
'"FLUX" is painted over it in big, red brush strokes with visible texture'
|
||||
),
|
||||
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
||||
num_steps: int | None = None,
|
||||
loop: bool = False,
|
||||
guidance: float = 2.5,
|
||||
offload: bool = False,
|
||||
output_dir: str = "output",
|
||||
add_sampling_metadata: bool = True,
|
||||
trt: bool = False,
|
||||
trt_transformer_precision: str = "bf16",
|
||||
track_usage: bool = False,
|
||||
):
|
||||
"""
|
||||
Sample the flux model. Either interactively (set `--loop`) or run for a
|
||||
single image.
|
||||
|
||||
Args:
|
||||
name: Name of the model to load
|
||||
height: height of the sample in pixels (should be a multiple of 16)
|
||||
width: width of the sample in pixels (should be a multiple of 16)
|
||||
seed: Set a seed for sampling
|
||||
output_dir: directory where output images are saved; files are named
img_{idx}.jpg, with {idx} replaced by the index of the sample
|
||||
prompt: Prompt used for sampling
|
||||
device: Pytorch device
|
||||
num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled)
|
||||
loop: start an interactive session and sample multiple times
|
||||
guidance: guidance value used for guidance distillation
|
||||
add_sampling_metadata: Add the prompt to the image Exif metadata
|
||||
trt: use TensorRT backend for optimized inference
|
||||
trt_transformer_precision: specify transformer precision for inference
|
||||
track_usage: track usage of the model for licensing purposes
|
||||
"""
|
||||
|
||||
prompt = prompt.split("|")
|
||||
if len(prompt) == 1:
|
||||
prompt = prompt[0]
|
||||
additional_prompts = None
|
||||
else:
|
||||
additional_prompts = prompt[1:]
|
||||
prompt = prompt[0]
|
||||
|
||||
assert not (
|
||||
(additional_prompts is not None) and loop
|
||||
), "Do not provide additional prompts and set loop to True"
|
||||
|
||||
nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
|
||||
|
||||
if name not in configs:
|
||||
available = ", ".join(configs.keys())
|
||||
raise ValueError(f"Got unknown model name: {name}, choose from {available}")
|
||||
|
||||
torch_device = torch.device(device)
|
||||
if num_steps is None:
|
||||
num_steps = 4 if name == "flux-schnell" else 50
|
||||
|
||||
# allow for packing and conversion to latent space
|
||||
height = 16 * (height // 16)
|
||||
width = 16 * (width // 16)
|
||||
|
||||
output_name = os.path.join(output_dir, "img_{idx}.jpg")
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
idx = 0
|
||||
else:
|
||||
fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
|
||||
if len(fns) > 0:
|
||||
idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
|
||||
else:
|
||||
idx = 0
|
||||
|
||||
if not trt:
|
||||
t5 = load_t5(torch_device, max_length=256 if name == "flux-schnell" else 512)
|
||||
clip = load_clip(torch_device)
|
||||
model = load_flow_model(name, device="cpu" if offload else torch_device)
|
||||
ae = load_ae(name, device="cpu" if offload else torch_device)
|
||||
else:
|
||||
# lazy import to make install optional
|
||||
from flux.trt.trt_manager import ModuleName, TRTManager
|
||||
|
||||
# Check if we need ONNX model access (which requires authentication for FLUX models)
|
||||
onnx_dir = check_onnx_access_for_trt(name, trt_transformer_precision)
|
||||
|
||||
trt_ctx_manager = TRTManager(
|
||||
trt_transformer_precision=trt_transformer_precision,
|
||||
trt_t5_precision=os.getenv("TRT_T5_PRECISION", "bf16"),
|
||||
)
|
||||
engines = trt_ctx_manager.load_engines(
|
||||
model_name=name,
|
||||
module_names={
|
||||
ModuleName.CLIP,
|
||||
ModuleName.TRANSFORMER,
|
||||
ModuleName.T5,
|
||||
ModuleName.VAE,
|
||||
},
|
||||
engine_dir=os.environ.get("TRT_ENGINE_DIR", "./engines"),
|
||||
custom_onnx_paths=onnx_dir or os.environ.get("CUSTOM_ONNX_PATHS", ""),
|
||||
trt_image_height=height,
|
||||
trt_image_width=width,
|
||||
trt_batch_size=1,
|
||||
trt_timing_cache=os.getenv("TRT_TIMING_CACHE_FILE", None),
|
||||
trt_static_batch=False,
|
||||
trt_static_shape=False,
|
||||
)
|
||||
|
||||
ae = engines[ModuleName.VAE].to(device="cpu" if offload else torch_device)
|
||||
model = engines[ModuleName.TRANSFORMER].to(device="cpu" if offload else torch_device)
|
||||
clip = engines[ModuleName.CLIP].to(torch_device)
|
||||
t5 = engines[ModuleName.T5].to(device="cpu" if offload else torch_device)
|
||||
|
||||
rng = torch.Generator(device="cpu")
|
||||
opts = SamplingOptions(
|
||||
prompt=prompt,
|
||||
width=width,
|
||||
height=height,
|
||||
num_steps=num_steps,
|
||||
guidance=guidance,
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
if loop:
|
||||
opts = parse_prompt(opts)
|
||||
|
||||
while opts is not None:
|
||||
if opts.seed is None:
|
||||
opts.seed = rng.seed()
|
||||
print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
|
||||
t0 = time.perf_counter()
|
||||
|
||||
# prepare input
|
||||
x = get_noise(
|
||||
1,
|
||||
opts.height,
|
||||
opts.width,
|
||||
device=torch_device,
|
||||
dtype=torch.bfloat16,
|
||||
seed=opts.seed,
|
||||
)
|
||||
opts.seed = None
|
||||
if offload:
|
||||
ae = ae.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
t5, clip = t5.to(torch_device), clip.to(torch_device)
|
||||
inp = prepare(t5, clip, x, prompt=opts.prompt)
|
||||
timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))
|
||||
|
||||
# offload TEs to CPU, load model to gpu
|
||||
if offload:
|
||||
t5, clip = t5.cpu(), clip.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
model = model.to(torch_device)
|
||||
|
||||
# denoise initial noise
|
||||
x = denoise(model, **inp, timesteps=timesteps, guidance=opts.guidance)
|
||||
|
||||
# offload model, load autoencoder to gpu
|
||||
if offload:
|
||||
model.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
ae.decoder.to(x.device)
|
||||
|
||||
# decode latents to pixel space
|
||||
x = unpack(x.float(), opts.height, opts.width)
|
||||
with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
|
||||
x = ae.decode(x)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.synchronize()
|
||||
t1 = time.perf_counter()
|
||||
|
||||
fn = output_name.format(idx=idx)
|
||||
print(f"Done in {t1 - t0:.1f}s. Saving {fn}")
|
||||
|
||||
idx = save_image(
|
||||
nsfw_classifier, name, output_name, idx, x, add_sampling_metadata, prompt, track_usage=track_usage
|
||||
)
|
||||
|
||||
if loop:
|
||||
print("-" * 80)
|
||||
opts = parse_prompt(opts)
|
||||
elif additional_prompts:
|
||||
next_prompt = additional_prompts.pop(0)
|
||||
opts.prompt = next_prompt
|
||||
else:
|
||||
opts = None
|
||||
|
||||
if trt:
|
||||
trt_ctx_manager.stop_runtime()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
Fire(main)
|
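# --- Added annotation (hedged, not part of the original file) ---
# Example invocations of this text-to-image CLI via python-fire. The script's
# file name is not shown in this diff; "cli.py" below is only a placeholder.
#   python cli.py --name flux-schnell --width 1360 --height 768 --prompt "misty forest"
#   python cli.py --name flux-dev --num_steps 50 --guidance 2.5 --offload --loop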
||||
@ -1,390 +0,0 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from glob import iglob
|
||||
|
||||
import torch
|
||||
from fire import Fire
|
||||
from transformers import pipeline
|
||||
|
||||
from flux.modules.image_embedders import CannyImageEncoder, DepthImageEncoder
|
||||
from flux.sampling import denoise, get_noise, get_schedule, prepare_control, unpack
|
||||
from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image
|
||||
|
||||
|
||||
@dataclass
|
||||
class SamplingOptions:
|
||||
prompt: str
|
||||
width: int
|
||||
height: int
|
||||
num_steps: int
|
||||
guidance: float
|
||||
seed: int | None
|
||||
img_cond_path: str
|
||||
lora_scale: float | None
|
||||
|
||||
|
||||
def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
|
||||
user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Either write your prompt directly, leave this field empty "
|
||||
"to repeat the prompt or write a command starting with a slash:\n"
|
||||
"- '/w <width>' will set the width of the generated image\n"
|
||||
"- '/h <height>' will set the height of the generated image\n"
|
||||
"- '/s <seed>' sets the next seed\n"
|
||||
"- '/g <guidance>' sets the guidance (flux-dev only)\n"
|
||||
"- '/n <steps>' sets the number of steps\n"
|
||||
"- '/q' to quit"
|
||||
)
|
||||
|
||||
while (prompt := input(user_question)).startswith("/"):
|
||||
if prompt.startswith("/w"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, width = prompt.split()
|
||||
options.width = 16 * (int(width) // 16)
|
||||
print(
|
||||
f"Setting resolution to {options.width} x {options.height} "
|
||||
f"({options.height * options.width / 1e6:.2f}MP)"
|
||||
)
|
||||
elif prompt.startswith("/h"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, height = prompt.split()
|
||||
options.height = 16 * (int(height) // 16)
|
||||
print(
|
||||
f"Setting resolution to {options.width} x {options.height} "
|
||||
f"({options.height * options.width / 1e6:.2f}MP)"
|
||||
)
|
||||
elif prompt.startswith("/g"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, guidance = prompt.split()
|
||||
options.guidance = float(guidance)
|
||||
print(f"Setting guidance to {options.guidance}")
|
||||
elif prompt.startswith("/s"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, seed = prompt.split()
|
||||
options.seed = int(seed)
|
||||
print(f"Setting seed to {options.seed}")
|
||||
elif prompt.startswith("/n"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, steps = prompt.split()
|
||||
options.num_steps = int(steps)
|
||||
print(f"Setting number of steps to {options.num_steps}")
|
||||
elif prompt.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None
|
||||
else:
|
||||
if not prompt.startswith("/h"):
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
print(usage)
|
||||
if prompt != "":
|
||||
options.prompt = prompt
|
||||
return options
|
||||
|
||||
|
||||
def parse_img_cond_path(options: SamplingOptions | None) -> SamplingOptions | None:
|
||||
if options is None:
|
||||
return None
|
||||
|
||||
user_question = "Next conditioning image (write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Either write your prompt directly, leave this field empty "
|
||||
"to repeat the conditioning image or write a command starting with a slash:\n"
|
||||
"- '/q' to quit"
|
||||
)
|
||||
|
||||
while True:
|
||||
img_cond_path = input(user_question)
|
||||
|
||||
if img_cond_path.startswith("/"):
|
||||
if img_cond_path.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None
|
||||
else:
|
||||
if not img_cond_path.startswith("/h"):
|
||||
print(f"Got invalid command '{img_cond_path}'\n{usage}")
|
||||
print(usage)
|
||||
continue
|
||||
|
||||
if img_cond_path == "":
|
||||
break
|
||||
|
||||
if not os.path.isfile(img_cond_path) or not img_cond_path.lower().endswith(
|
||||
(".jpg", ".jpeg", ".png", ".webp")
|
||||
):
|
||||
print(f"File '{img_cond_path}' does not exist or is not a valid image file")
|
||||
continue
|
||||
|
||||
options.img_cond_path = img_cond_path
|
||||
break
|
||||
|
||||
return options
|
||||
|
||||
|
||||
def parse_lora_scale(options: SamplingOptions | None) -> tuple[SamplingOptions | None, bool]:
|
||||
changed = False
|
||||
|
||||
if options is None:
|
||||
return None, changed
|
||||
|
||||
user_question = "Next lora scale (write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Either write your prompt directly, leave this field empty "
|
||||
"to repeat the lora scale or write a command starting with a slash:\n"
|
||||
"- '/q' to quit"
|
||||
)
|
||||
|
||||
while (prompt := input(user_question)).startswith("/"):
|
||||
if prompt.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None, changed
|
||||
else:
|
||||
if not prompt.startswith("/h"):
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
print(usage)
|
||||
if prompt != "":
|
||||
options.lora_scale = float(prompt)
|
||||
changed = True
|
||||
return options, changed
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def main(
|
||||
name: str,
|
||||
width: int = 1024,
|
||||
height: int = 1024,
|
||||
seed: int | None = None,
|
||||
prompt: str = "a robot made out of gold",
|
||||
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
||||
num_steps: int = 50,
|
||||
loop: bool = False,
|
||||
guidance: float | None = None,
|
||||
offload: bool = False,
|
||||
output_dir: str = "output",
|
||||
add_sampling_metadata: bool = True,
|
||||
img_cond_path: str = "assets/robot.webp",
|
||||
lora_scale: float | None = 0.85,
|
||||
trt: bool = False,
|
||||
trt_transformer_precision: str = "bf16",
|
||||
track_usage: bool = False,
|
||||
**kwargs: dict | None,
|
||||
):
|
||||
"""
|
||||
Sample the flux model. Either interactively (set `--loop`) or run for a
|
||||
single image.
|
||||
|
||||
Args:
|
||||
height: height of the sample in pixels (should be a multiple of 16)
|
||||
width: width of the sample in pixels (should be a multiple of 16)
|
||||
seed: Set a seed for sampling
|
||||
output_dir: directory where output images are saved; files are named
img_{idx}.jpg, with {idx} replaced by the index of the sample
|
||||
prompt: Prompt used for sampling
|
||||
device: Pytorch device
|
||||
num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled)
|
||||
loop: start an interactive session and sample multiple times
|
||||
guidance: guidance value used for guidance distillation
|
||||
add_sampling_metadata: Add the prompt to the image Exif metadata
|
||||
img_cond_path: path to conditioning image (jpeg/png/webp)
|
||||
trt: use TensorRT backend for optimized inference
|
||||
trt_transformer_precision: specify transformer precision for inference
|
||||
track_usage: track usage of the model for licensing purposes
|
||||
"""
|
||||
nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
|
||||
|
||||
if "lora" in name:
|
||||
assert not trt, "TRT does not support LORA"
|
||||
assert name in [
|
||||
"flux-dev-canny",
|
||||
"flux-dev-depth",
|
||||
"flux-dev-canny-lora",
|
||||
"flux-dev-depth-lora",
|
||||
], f"Got unknown model name: {name}"
|
||||
|
||||
if guidance is None:
|
||||
if name in ["flux-dev-canny", "flux-dev-canny-lora"]:
|
||||
guidance = 30.0
|
||||
elif name in ["flux-dev-depth", "flux-dev-depth-lora"]:
|
||||
guidance = 10.0
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
if name not in configs:
|
||||
available = ", ".join(configs.keys())
|
||||
raise ValueError(f"Got unknown model name: {name}, choose from {available}")
|
||||
|
||||
torch_device = torch.device(device)
|
||||
|
||||
output_name = os.path.join(output_dir, "img_{idx}.jpg")
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
idx = 0
|
||||
else:
|
||||
fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
|
||||
if len(fns) > 0:
|
||||
idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
|
||||
else:
|
||||
idx = 0
|
||||
|
||||
if name in ["flux-dev-depth", "flux-dev-depth-lora"]:
|
||||
img_embedder = DepthImageEncoder(torch_device)
|
||||
elif name in ["flux-dev-canny", "flux-dev-canny-lora"]:
|
||||
img_embedder = CannyImageEncoder(torch_device)
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
if not trt:
|
||||
# init all components
|
||||
t5 = load_t5(torch_device, max_length=512)
|
||||
clip = load_clip(torch_device)
|
||||
model = load_flow_model(name, device="cpu" if offload else torch_device)
|
||||
ae = load_ae(name, device="cpu" if offload else torch_device)
|
||||
else:
|
||||
# lazy import to make install optional
|
||||
from flux.trt.trt_manager import ModuleName, TRTManager
|
||||
|
||||
trt_ctx_manager = TRTManager(
|
||||
trt_transformer_precision=trt_transformer_precision,
|
||||
trt_t5_precision=os.environ.get("TRT_T5_PRECISION", "bf16"),
|
||||
)
|
||||
|
||||
engines = trt_ctx_manager.load_engines(
|
||||
model_name=name,
|
||||
module_names={
|
||||
ModuleName.CLIP,
|
||||
ModuleName.TRANSFORMER,
|
||||
ModuleName.T5,
|
||||
ModuleName.VAE,
|
||||
ModuleName.VAE_ENCODER,
|
||||
},
|
||||
engine_dir=os.environ.get("TRT_ENGINE_DIR", "./engines"),
|
||||
custom_onnx_paths=os.environ.get("CUSTOM_ONNX_PATHS", ""),
|
||||
trt_image_height=height,
|
||||
trt_image_width=width,
|
||||
trt_batch_size=1,
|
||||
trt_static_batch=kwargs.get("static_batch", True),
|
||||
trt_static_shape=kwargs.get("static_shape", True),
|
||||
)
|
||||
|
||||
ae = engines[ModuleName.VAE].to(device="cpu" if offload else torch_device)
|
||||
model = engines[ModuleName.TRANSFORMER].to(device="cpu" if offload else torch_device)
|
||||
clip = engines[ModuleName.CLIP].to(torch_device)
|
||||
t5 = engines[ModuleName.T5].to(device="cpu" if offload else torch_device)
|
||||
|
||||
# set lora scale
|
||||
if "lora" in name and lora_scale is not None:
|
||||
for _, module in model.named_modules():
|
||||
if hasattr(module, "set_scale"):
|
||||
module.set_scale(lora_scale)
|
||||
|
||||
rng = torch.Generator(device="cpu")
|
||||
opts = SamplingOptions(
|
||||
prompt=prompt,
|
||||
width=width,
|
||||
height=height,
|
||||
num_steps=num_steps,
|
||||
guidance=guidance,
|
||||
seed=seed,
|
||||
img_cond_path=img_cond_path,
|
||||
lora_scale=lora_scale,
|
||||
)
|
||||
|
||||
if loop:
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
if "lora" in name:
|
||||
opts, changed = parse_lora_scale(opts)
|
||||
if changed:
|
||||
# update the lora scale:
|
||||
for _, module in model.named_modules():
|
||||
if hasattr(module, "set_scale"):
|
||||
module.set_scale(opts.lora_scale)
|
||||
|
||||
while opts is not None:
|
||||
if opts.seed is None:
|
||||
opts.seed = rng.seed()
|
||||
print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
|
||||
t0 = time.perf_counter()
|
||||
|
||||
# prepare input
|
||||
x = get_noise(
|
||||
1,
|
||||
opts.height,
|
||||
opts.width,
|
||||
device=torch_device,
|
||||
dtype=torch.bfloat16,
|
||||
seed=opts.seed,
|
||||
)
|
||||
opts.seed = None
|
||||
if offload:
|
||||
t5, clip, ae = t5.to(torch_device), clip.to(torch_device), ae.to(torch_device)
|
||||
inp = prepare_control(
|
||||
t5,
|
||||
clip,
|
||||
x,
|
||||
prompt=opts.prompt,
|
||||
ae=ae,
|
||||
encoder=img_embedder,
|
||||
img_cond_path=opts.img_cond_path,
|
||||
)
|
||||
timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))
|
||||
|
||||
# offload TEs and AE to CPU, load model to gpu
|
||||
if offload:
|
||||
t5, clip, ae = t5.cpu(), clip.cpu(), ae.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
model = model.to(torch_device)
|
||||
|
||||
# denoise initial noise
|
||||
x = denoise(model, **inp, timesteps=timesteps, guidance=opts.guidance)
|
||||
|
||||
# offload model, load autoencoder to gpu
|
||||
if offload:
|
||||
model.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
ae.decoder.to(x.device)
|
||||
|
||||
# decode latents to pixel space
|
||||
x = unpack(x.float(), opts.height, opts.width)
|
||||
with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
|
||||
x = ae.decode(x)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.synchronize()
|
||||
t1 = time.perf_counter()
|
||||
print(f"Done in {t1 - t0:.1f}s")
|
||||
|
||||
idx = save_image(
|
||||
nsfw_classifier, name, output_name, idx, x, add_sampling_metadata, prompt, track_usage=track_usage
|
||||
)
|
||||
|
||||
if loop:
|
||||
print("-" * 80)
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
if "lora" in name:
|
||||
opts, changed = parse_lora_scale(opts)
|
||||
if changed:
|
||||
# update the lora scale:
|
||||
for _, module in model.named_modules():
|
||||
if hasattr(module, "set_scale"):
|
||||
module.set_scale(opts.lora_scale)
|
||||
else:
|
||||
opts = None
|
||||
|
||||
if trt:
|
||||
trt_ctx_manager.stop_runtime()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
Fire(main)
|
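# --- Added annotation (hedged, not part of the original file) ---
# Example invocation of the structural-conditioning CLI above; the file name
# "cli_control.py" is a placeholder, not taken from this diff.
#   python cli_control.py --name flux-dev-canny-lora --img_cond_path assets/robot.webp \
#       --prompt "a robot made out of gold" --lora_scale 0.85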
||||
@ -1,334 +0,0 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from glob import iglob
|
||||
|
||||
import torch
|
||||
from fire import Fire
|
||||
from PIL import Image
|
||||
from transformers import pipeline
|
||||
|
||||
from flux.sampling import denoise, get_noise, get_schedule, prepare_fill, unpack
|
||||
from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image
|
||||
|
||||
|
||||
@dataclass
|
||||
class SamplingOptions:
|
||||
prompt: str
|
||||
width: int
|
||||
height: int
|
||||
num_steps: int
|
||||
guidance: float
|
||||
seed: int | None
|
||||
img_cond_path: str
|
||||
img_mask_path: str
|
||||
|
||||
|
||||
def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
|
||||
user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Either write your prompt directly, leave this field empty "
|
||||
"to repeat the prompt or write a command starting with a slash:\n"
|
||||
"- '/s <seed>' sets the next seed\n"
|
||||
"- '/g <guidance>' sets the guidance (flux-dev only)\n"
|
||||
"- '/n <steps>' sets the number of steps\n"
|
||||
"- '/q' to quit"
|
||||
)
|
||||
|
||||
while (prompt := input(user_question)).startswith("/"):
|
||||
if prompt.startswith("/g"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, guidance = prompt.split()
|
||||
options.guidance = float(guidance)
|
||||
print(f"Setting guidance to {options.guidance}")
|
||||
elif prompt.startswith("/s"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, seed = prompt.split()
|
||||
options.seed = int(seed)
|
||||
print(f"Setting seed to {options.seed}")
|
||||
elif prompt.startswith("/n"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, steps = prompt.split()
|
||||
options.num_steps = int(steps)
|
||||
print(f"Setting number of steps to {options.num_steps}")
|
||||
elif prompt.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None
|
||||
else:
|
||||
if not prompt.startswith("/h"):
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
print(usage)
|
||||
if prompt != "":
|
||||
options.prompt = prompt
|
||||
return options
|
||||
|
||||
|
||||
def parse_img_cond_path(options: SamplingOptions | None) -> SamplingOptions | None:
|
||||
if options is None:
|
||||
return None
|
||||
|
||||
user_question = "Next conditioning image (write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Either write your prompt directly, leave this field empty "
|
||||
"to repeat the conditioning image or write a command starting with a slash:\n"
|
||||
"- '/q' to quit"
|
||||
)
|
||||
|
||||
while True:
|
||||
img_cond_path = input(user_question)
|
||||
|
||||
if img_cond_path.startswith("/"):
|
||||
if img_cond_path.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None
|
||||
else:
|
||||
if not img_cond_path.startswith("/h"):
|
||||
print(f"Got invalid command '{img_cond_path}'\n{usage}")
|
||||
print(usage)
|
||||
continue
|
||||
|
||||
if img_cond_path == "":
|
||||
break
|
||||
|
||||
if not os.path.isfile(img_cond_path) or not img_cond_path.lower().endswith(
|
||||
(".jpg", ".jpeg", ".png", ".webp")
|
||||
):
|
||||
print(f"File '{img_cond_path}' does not exist or is not a valid image file")
|
||||
continue
|
||||
else:
|
||||
with Image.open(img_cond_path) as img:
|
||||
width, height = img.size
|
||||
|
||||
if width % 32 != 0 or height % 32 != 0:
|
||||
print(f"Image dimensions must be divisible by 32, got {width}x{height}")
|
||||
continue
|
||||
|
||||
options.img_cond_path = img_cond_path
|
||||
break
|
||||
|
||||
return options
|
||||
|
||||
|
||||
def parse_img_mask_path(options: SamplingOptions | None) -> SamplingOptions | None:
|
||||
if options is None:
|
||||
return None
|
||||
|
||||
user_question = "Next conditioning mask (write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Either write your prompt directly, leave this field empty "
|
||||
"to repeat the conditioning mask or write a command starting with a slash:\n"
|
||||
"- '/q' to quit"
|
||||
)
|
||||
|
||||
while True:
|
||||
img_mask_path = input(user_question)
|
||||
|
||||
if img_mask_path.startswith("/"):
|
||||
if img_mask_path.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None
|
||||
else:
|
||||
if not img_mask_path.startswith("/h"):
|
||||
print(f"Got invalid command '{img_mask_path}'\n{usage}")
|
||||
print(usage)
|
||||
continue
|
||||
|
||||
if img_mask_path == "":
|
||||
break
|
||||
|
||||
if not os.path.isfile(img_mask_path) or not img_mask_path.lower().endswith(
|
||||
(".jpg", ".jpeg", ".png", ".webp")
|
||||
):
|
||||
print(f"File '{img_mask_path}' does not exist or is not a valid image file")
|
||||
continue
|
||||
else:
|
||||
with Image.open(img_mask_path) as img:
|
||||
width, height = img.size
|
||||
|
||||
if width % 32 != 0 or height % 32 != 0:
|
||||
print(f"Image dimensions must be divisible by 32, got {width}x{height}")
|
||||
continue
|
||||
else:
|
||||
with Image.open(options.img_cond_path) as img_cond:
|
||||
img_cond_width, img_cond_height = img_cond.size
|
||||
|
||||
if width != img_cond_width or height != img_cond_height:
|
||||
print(
|
||||
f"Mask dimensions must match conditioning image, got {width}x{height} and {img_cond_width}x{img_cond_height}"
|
||||
)
|
||||
continue
|
||||
|
||||
options.img_mask_path = img_mask_path
|
||||
break
|
||||
|
||||
return options
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def main(
|
||||
seed: int | None = None,
|
||||
prompt: str = "a white paper cup",
|
||||
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
||||
num_steps: int = 50,
|
||||
loop: bool = False,
|
||||
guidance: float = 30.0,
|
||||
offload: bool = False,
|
||||
output_dir: str = "output",
|
||||
add_sampling_metadata: bool = True,
|
||||
img_cond_path: str = "assets/cup.png",
|
||||
img_mask_path: str = "assets/cup_mask.png",
|
||||
track_usage: bool = False,
|
||||
):
|
||||
"""
|
||||
Sample the flux model. Either interactively (set `--loop`) or run for a
|
||||
single image. This demo assumes that the conditioning image and mask have
|
||||
the same shape and that height and width are divisible by 32.
|
||||
|
||||
Args:
|
||||
seed: Set a seed for sampling
|
||||
output_dir: directory where output images are saved; files are named
img_{idx}.jpg, with {idx} replaced by the index of the sample
|
||||
prompt: Prompt used for sampling
|
||||
device: Pytorch device
|
||||
num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled)
|
||||
loop: start an interactive session and sample multiple times
|
||||
guidance: guidance value used for guidance distillation
|
||||
add_sampling_metadata: Add the prompt to the image Exif metadata
|
||||
img_cond_path: path to conditioning image (jpeg/png/webp)
|
||||
img_mask_path: path to conditioning mask (jpeg/png/webp)
|
||||
track_usage: track usage of the model for licensing purposes
|
||||
"""
|
||||
nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
|
||||
|
||||
name = "flux-dev-fill"
|
||||
if name not in configs:
|
||||
available = ", ".join(configs.keys())
|
||||
raise ValueError(f"Got unknown model name: {name}, choose from {available}")
|
||||
|
||||
torch_device = torch.device(device)
|
||||
|
||||
output_name = os.path.join(output_dir, "img_{idx}.jpg")
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
idx = 0
|
||||
else:
|
||||
fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
|
||||
if len(fns) > 0:
|
||||
idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
|
||||
else:
|
||||
idx = 0
|
||||
|
||||
# init all components
|
||||
t5 = load_t5(torch_device, max_length=128)
|
||||
clip = load_clip(torch_device)
|
||||
model = load_flow_model(name, device="cpu" if offload else torch_device)
|
||||
ae = load_ae(name, device="cpu" if offload else torch_device)
|
||||
|
||||
rng = torch.Generator(device="cpu")
|
||||
with Image.open(img_cond_path) as img:
|
||||
width, height = img.size
|
||||
opts = SamplingOptions(
|
||||
prompt=prompt,
|
||||
width=width,
|
||||
height=height,
|
||||
num_steps=num_steps,
|
||||
guidance=guidance,
|
||||
seed=seed,
|
||||
img_cond_path=img_cond_path,
|
||||
img_mask_path=img_mask_path,
|
||||
)
|
||||
|
||||
if loop:
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
|
||||
with Image.open(opts.img_cond_path) as img:
|
||||
width, height = img.size
|
||||
opts.height = height
|
||||
opts.width = width
|
||||
|
||||
opts = parse_img_mask_path(opts)
|
||||
|
||||
while opts is not None:
|
||||
if opts.seed is None:
|
||||
opts.seed = rng.seed()
|
||||
print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
|
||||
t0 = time.perf_counter()
|
||||
|
||||
# prepare input
|
||||
x = get_noise(
|
||||
1,
|
||||
opts.height,
|
||||
opts.width,
|
||||
device=torch_device,
|
||||
dtype=torch.bfloat16,
|
||||
seed=opts.seed,
|
||||
)
|
||||
opts.seed = None
|
||||
if offload:
|
||||
t5, clip, ae = t5.to(torch_device), clip.to(torch_device), ae.to(torch_device)
|
||||
inp = prepare_fill(
|
||||
t5,
|
||||
clip,
|
||||
x,
|
||||
prompt=opts.prompt,
|
||||
ae=ae,
|
||||
img_cond_path=opts.img_cond_path,
|
||||
mask_path=opts.img_mask_path,
|
||||
)
|
||||
|
||||
timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))
|
||||
|
||||
# offload TEs and AE to CPU, load model to gpu
|
||||
if offload:
|
||||
t5, clip, ae = t5.cpu(), clip.cpu(), ae.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
model = model.to(torch_device)
|
||||
|
||||
# denoise initial noise
|
||||
x = denoise(model, **inp, timesteps=timesteps, guidance=opts.guidance)
|
||||
|
||||
# offload model, load autoencoder to gpu
|
||||
if offload:
|
||||
model.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
ae.decoder.to(x.device)
|
||||
|
||||
# decode latents to pixel space
|
||||
x = unpack(x.float(), opts.height, opts.width)
|
||||
with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
|
||||
x = ae.decode(x)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.synchronize()
|
||||
t1 = time.perf_counter()
|
||||
print(f"Done in {t1 - t0:.1f}s")
|
||||
|
||||
idx = save_image(
|
||||
nsfw_classifier, name, output_name, idx, x, add_sampling_metadata, prompt, track_usage=track_usage
|
||||
)
|
||||
|
||||
if loop:
|
||||
print("-" * 80)
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
|
||||
with Image.open(opts.img_cond_path) as img:
|
||||
width, height = img.size
|
||||
opts.height = height
|
||||
opts.width = width
|
||||
|
||||
opts = parse_img_mask_path(opts)
|
||||
else:
|
||||
opts = None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
Fire(main)
|
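# --- Added annotation (hedged, not part of the original file) ---
# Example invocation of the inpainting CLI above; image and mask must have the
# same size, divisible by 32. "cli_fill.py" is a placeholder file name.
#   python cli_fill.py --img_cond_path assets/cup.png --img_mask_path assets/cup_mask.png \
#       --prompt "a white paper cup" --guidance 30.0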
||||
@ -1,368 +0,0 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from glob import iglob
|
||||
|
||||
import torch
|
||||
from fire import Fire
|
||||
|
||||
from flux.content_filters import PixtralContentFilter
|
||||
from flux.sampling import denoise, get_schedule, prepare_kontext, unpack
|
||||
from flux.util import (
|
||||
aspect_ratio_to_height_width,
|
||||
check_onnx_access_for_trt,
|
||||
load_ae,
|
||||
load_clip,
|
||||
load_flow_model,
|
||||
load_t5,
|
||||
save_image,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SamplingOptions:
|
||||
prompt: str
|
||||
width: int | None
|
||||
height: int | None
|
||||
num_steps: int
|
||||
guidance: float
|
||||
seed: int | None
|
||||
img_cond_path: str
|
||||
|
||||
|
||||
def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
|
||||
user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Either write your prompt directly, leave this field empty "
|
||||
"to repeat the prompt or write a command starting with a slash:\n"
|
||||
"- '/ar <width>:<height>' will set the aspect ratio of the generated image\n"
|
||||
"- '/s <seed>' sets the next seed\n"
|
||||
"- '/g <guidance>' sets the guidance (flux-dev only)\n"
|
||||
"- '/n <steps>' sets the number of steps\n"
|
||||
"- '/q' to quit"
|
||||
)
|
||||
|
||||
while (prompt := input(user_question)).startswith("/"):
|
||||
if prompt.startswith("/ar"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, ratio_prompt = prompt.split()
|
||||
if ratio_prompt == "auto":
|
||||
options.width = None
|
||||
options.height = None
|
||||
print("Setting resolution to input image resolution.")
|
||||
else:
|
||||
options.width, options.height = aspect_ratio_to_height_width(ratio_prompt)
|
||||
print(f"Setting resolution to {options.width} x {options.height}.")
|
||||
elif prompt.startswith("/h"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, height = prompt.split()
|
||||
if height == "auto":
|
||||
options.height = None
|
||||
else:
|
||||
options.height = 16 * (int(height) // 16)
|
||||
if options.height is not None and options.width is not None:
|
||||
print(
|
||||
f"Setting resolution to {options.width} x {options.height} "
|
||||
f"({options.height * options.width / 1e6:.2f}MP)"
|
||||
)
|
||||
else:
|
||||
print(f"Setting resolution to {options.width} x {options.height}.")
|
||||
elif prompt.startswith("/g"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, guidance = prompt.split()
|
||||
options.guidance = float(guidance)
|
||||
print(f"Setting guidance to {options.guidance}")
|
||||
elif prompt.startswith("/s"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, seed = prompt.split()
|
||||
options.seed = int(seed)
|
||||
print(f"Setting seed to {options.seed}")
|
||||
elif prompt.startswith("/n"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, steps = prompt.split()
|
||||
options.num_steps = int(steps)
|
||||
print(f"Setting number of steps to {options.num_steps}")
|
||||
elif prompt.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None
|
||||
else:
|
||||
if not prompt.startswith("/h"):
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
print(usage)
|
||||
if prompt != "":
|
||||
options.prompt = prompt
|
||||
return options
|
||||
|
||||
|
||||
def parse_img_cond_path(options: SamplingOptions | None) -> SamplingOptions | None:
|
||||
if options is None:
|
||||
return None
|
||||
|
||||
user_question = "Next input image (write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Either write a path to an image directly, leave this field empty "
|
||||
"to repeat the last input image or write a command starting with a slash:\n"
|
||||
"- '/q' to quit\n\n"
|
||||
"The input image will be edited by FLUX.1 Kontext creating a new image based"
|
||||
"on your instruction prompt."
|
||||
)
|
||||
|
||||
while True:
|
||||
img_cond_path = input(user_question)
|
||||
|
||||
if img_cond_path.startswith("/"):
|
||||
if img_cond_path.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None
|
||||
else:
|
||||
if not img_cond_path.startswith("/h"):
|
||||
print(f"Got invalid command '{img_cond_path}'\n{usage}")
|
||||
print(usage)
|
||||
continue
|
||||
|
||||
if img_cond_path == "":
|
||||
break
|
||||
|
||||
if not os.path.isfile(img_cond_path) or not img_cond_path.lower().endswith(
|
||||
(".jpg", ".jpeg", ".png", ".webp")
|
||||
):
|
||||
print(f"File '{img_cond_path}' does not exist or is not a valid image file")
|
||||
continue
|
||||
|
||||
options.img_cond_path = img_cond_path
|
||||
break
|
||||
|
||||
return options
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def main(
|
||||
name: str = "flux-dev-kontext",
|
||||
aspect_ratio: str | None = None,
|
||||
seed: int | None = None,
|
||||
prompt: str = "replace the logo with the text 'Black Forest Labs'",
|
||||
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
||||
num_steps: int = 30,
|
||||
loop: bool = False,
|
||||
guidance: float = 2.5,
|
||||
offload: bool = False,
|
||||
output_dir: str = "output",
|
||||
add_sampling_metadata: bool = True,
|
||||
img_cond_path: str = "assets/cup.png",
|
||||
trt: bool = False,
|
||||
trt_transformer_precision: str = "bf16",
|
||||
track_usage: bool = False,
|
||||
):
|
||||
"""
|
||||
Sample the flux model. Either interactively (set `--loop`) or run for a
|
||||
single image.
|
||||
|
||||
Args:
|
||||
height: height of the sample in pixels (should be a multiple of 16), None
|
||||
defaults to the size of the conditioning
|
||||
width: width of the sample in pixels (should be a multiple of 16), None
|
||||
defaults to the size of the conditioning
|
||||
seed: Set a seed for sampling
|
||||
output_dir: directory where output images are saved; files are named
img_{idx}.jpg, with {idx} replaced by the index of the sample
|
||||
prompt: Prompt used for sampling
|
||||
device: Pytorch device
|
||||
num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled)
|
||||
loop: start an interactive session and sample multiple times
|
||||
guidance: guidance value used for guidance distillation
|
||||
add_sampling_metadata: Add the prompt to the image Exif metadata
|
||||
img_cond_path: path to conditioning image (jpeg/png/webp)
|
||||
trt: use TensorRT backend for optimized inference
|
||||
track_usage: track usage of the model for licensing purposes
|
||||
"""
|
||||
assert name == "flux-dev-kontext", f"Got unknown model name: {name}"
|
||||
|
||||
torch_device = torch.device(device)
|
||||
|
||||
output_name = os.path.join(output_dir, "img_{idx}.jpg")
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
idx = 0
|
||||
else:
|
||||
fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
|
||||
if len(fns) > 0:
|
||||
idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
|
||||
else:
|
||||
idx = 0
|
||||
|
||||
if aspect_ratio is None:
|
||||
width = None
|
||||
height = None
|
||||
else:
|
||||
width, height = aspect_ratio_to_height_width(aspect_ratio)
|
||||
|
||||
if not trt:
|
||||
t5 = load_t5(torch_device, max_length=512)
|
||||
clip = load_clip(torch_device)
|
||||
model = load_flow_model(name, device="cpu" if offload else torch_device)
|
||||
else:
|
||||
# lazy import to make install optional
|
||||
from flux.trt.trt_manager import ModuleName, TRTManager
|
||||
|
||||
# Check if we need ONNX model access (which requires authentication for FLUX models)
|
||||
onnx_dir = check_onnx_access_for_trt(name, trt_transformer_precision)
|
||||
|
||||
trt_ctx_manager = TRTManager(
|
||||
trt_transformer_precision=trt_transformer_precision,
|
||||
trt_t5_precision=os.environ.get("TRT_T5_PRECISION", "bf16"),
|
||||
)
|
||||
engines = trt_ctx_manager.load_engines(
|
||||
model_name=name,
|
||||
module_names={
|
||||
ModuleName.CLIP,
|
||||
ModuleName.TRANSFORMER,
|
||||
ModuleName.T5,
|
||||
},
|
||||
engine_dir=os.environ.get("TRT_ENGINE_DIR", "./engines"),
|
||||
custom_onnx_paths=onnx_dir or os.environ.get("CUSTOM_ONNX_PATHS", ""),
|
||||
trt_image_height=height,
|
||||
trt_image_width=width,
|
||||
trt_batch_size=1,
|
||||
trt_timing_cache=os.getenv("TRT_TIMING_CACHE_FILE", None),
|
||||
trt_static_batch=False,
|
||||
trt_static_shape=False,
|
||||
)
|
||||
|
||||
model = engines[ModuleName.TRANSFORMER].to(device="cpu" if offload else torch_device)
|
||||
clip = engines[ModuleName.CLIP].to(torch_device)
|
||||
t5 = engines[ModuleName.T5].to(device="cpu" if offload else torch_device)
|
||||
|
||||
ae = load_ae(name, device="cpu" if offload else torch_device)
|
||||
content_filter = PixtralContentFilter(torch.device("cpu"))
|
||||
|
||||
rng = torch.Generator(device="cpu")
|
||||
opts = SamplingOptions(
|
||||
prompt=prompt,
|
||||
width=width,
|
||||
height=height,
|
||||
num_steps=num_steps,
|
||||
guidance=guidance,
|
||||
seed=seed,
|
||||
img_cond_path=img_cond_path,
|
||||
)
|
||||
|
||||
if loop:
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
|
||||
while opts is not None:
|
||||
if opts.seed is None:
|
||||
opts.seed = rng.seed()
|
||||
print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
|
||||
t0 = time.perf_counter()
|
||||
|
||||
if content_filter.test_txt(opts.prompt):
|
||||
print("Your prompt has been automatically flagged. Please choose another prompt.")
|
||||
if loop:
|
||||
print("-" * 80)
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
else:
|
||||
opts = None
|
||||
continue
|
||||
if content_filter.test_image(opts.img_cond_path):
|
||||
print("Your input image has been automatically flagged. Please choose another image.")
|
||||
if loop:
|
||||
print("-" * 80)
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
else:
|
||||
opts = None
|
||||
continue
|
||||
|
||||
if offload:
|
||||
t5, clip, ae = t5.to(torch_device), clip.to(torch_device), ae.to(torch_device)
|
||||
inp, height, width = prepare_kontext(
|
||||
t5=t5,
|
||||
clip=clip,
|
||||
prompt=opts.prompt,
|
||||
ae=ae,
|
||||
img_cond_path=opts.img_cond_path,
|
||||
target_width=opts.width,
|
||||
target_height=opts.height,
|
||||
bs=1,
|
||||
seed=opts.seed,
|
||||
device=torch_device,
|
||||
)
|
||||
from safetensors.torch import save_file
|
||||
|
||||
save_file({k: v.cpu().contiguous() for k, v in inp.items()}, "output/noise.sft")
|
||||
inp.pop("img_cond_orig")
|
||||
opts.seed = None
|
||||
timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))
|
||||
|
||||
# offload TEs and AE to CPU, load model to gpu
|
||||
if offload:
|
||||
t5, clip, ae = t5.cpu(), clip.cpu(), ae.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
model = model.to(torch_device)
|
||||
|
||||
# denoise initial noise
|
||||
t00 = time.time()
|
||||
x = denoise(model, **inp, timesteps=timesteps, guidance=opts.guidance)
|
||||
torch.cuda.synchronize()
|
||||
t01 = time.time()
|
||||
print(f"Denoising took {t01 - t00:.3f}s")
|
||||
|
||||
# offload model, load autoencoder to gpu
|
||||
if offload:
|
||||
model.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
ae.decoder.to(x.device)
|
||||
|
||||
# decode latents to pixel space
|
||||
x = unpack(x.float(), height, width)
|
||||
with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
|
||||
ae_dev_t0 = time.perf_counter()
|
||||
x = ae.decode(x)
|
||||
torch.cuda.synchronize()
|
||||
ae_dev_t1 = time.perf_counter()
|
||||
print(f"AE decode took {ae_dev_t1 - ae_dev_t0:.3f}s")
|
||||
|
||||
if content_filter.test_image(x.cpu()):
|
||||
print(
|
||||
"Your output image has been automatically flagged. Choose another prompt/image or try again."
|
||||
)
|
||||
if loop:
|
||||
print("-" * 80)
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
else:
|
||||
opts = None
|
||||
continue
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.synchronize()
|
||||
t1 = time.perf_counter()
|
||||
print(f"Done in {t1 - t0:.1f}s")
|
||||
|
||||
idx = save_image(
|
||||
None, name, output_name, idx, x, add_sampling_metadata, prompt, track_usage=track_usage
|
||||
)
|
||||
|
||||
if loop:
|
||||
print("-" * 80)
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
else:
|
||||
opts = None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
Fire(main)
|
||||
@ -1,290 +0,0 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from glob import iglob
|
||||
|
||||
import torch
|
||||
from fire import Fire
|
||||
from transformers import pipeline
|
||||
|
||||
from flux.modules.image_embedders import ReduxImageEncoder
|
||||
from flux.sampling import denoise, get_noise, get_schedule, prepare_redux, unpack
|
||||
from flux.util import (
|
||||
get_checkpoint_path,
|
||||
load_ae,
|
||||
load_clip,
|
||||
load_flow_model,
|
||||
load_t5,
|
||||
save_image,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SamplingOptions:
|
||||
prompt: str
|
||||
width: int
|
||||
height: int
|
||||
num_steps: int
|
||||
guidance: float
|
||||
seed: int | None
|
||||
img_cond_path: str
|
||||
|
||||
|
||||
def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
|
||||
user_question = "Write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Leave this field empty to do nothing "
|
||||
"or write a command starting with a slash:\n"
|
||||
"- '/w <width>' will set the width of the generated image\n"
|
||||
"- '/h <height>' will set the height of the generated image\n"
|
||||
"- '/s <seed>' sets the next seed\n"
|
||||
"- '/g <guidance>' sets the guidance (flux-dev only)\n"
|
||||
"- '/n <steps>' sets the number of steps\n"
|
||||
"- '/q' to quit"
|
||||
)
|
||||
|
||||
while (prompt := input(user_question)).startswith("/"):
|
||||
if prompt.startswith("/w"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, width = prompt.split()
|
||||
options.width = 16 * (int(width) // 16)
|
||||
print(
|
||||
f"Setting resolution to {options.width} x {options.height} "
|
||||
f"({options.height * options.width / 1e6:.2f}MP)"
|
||||
)
|
||||
elif prompt.startswith("/h"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, height = prompt.split()
|
||||
options.height = 16 * (int(height) // 16)
|
||||
print(
|
||||
f"Setting resolution to {options.width} x {options.height} "
|
||||
f"({options.height * options.width / 1e6:.2f}MP)"
|
||||
)
|
||||
elif prompt.startswith("/g"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, guidance = prompt.split()
|
||||
options.guidance = float(guidance)
|
||||
print(f"Setting guidance to {options.guidance}")
|
||||
elif prompt.startswith("/s"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, seed = prompt.split()
|
||||
options.seed = int(seed)
|
||||
print(f"Setting seed to {options.seed}")
|
||||
elif prompt.startswith("/n"):
|
||||
if prompt.count(" ") != 1:
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
continue
|
||||
_, steps = prompt.split()
|
||||
options.num_steps = int(steps)
|
||||
print(f"Setting number of steps to {options.num_steps}")
|
||||
elif prompt.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None
|
||||
else:
|
||||
if not prompt.startswith("/h"):
|
||||
print(f"Got invalid command '{prompt}'\n{usage}")
|
||||
print(usage)
|
||||
return options
|
||||
|
||||
|
||||
def parse_img_cond_path(options: SamplingOptions | None) -> SamplingOptions | None:
|
||||
if options is None:
|
||||
return None
|
||||
|
||||
user_question = "Next conditioning image (write /h for help, /q to quit and leave empty to repeat):\n"
|
||||
usage = (
|
||||
"Usage: Either write your prompt directly, leave this field empty "
|
||||
"to repeat the conditioning image or write a command starting with a slash:\n"
|
||||
"- '/q' to quit"
|
||||
)
|
||||
|
||||
while True:
|
||||
img_cond_path = input(user_question)
|
||||
|
||||
if img_cond_path.startswith("/"):
|
||||
if img_cond_path.startswith("/q"):
|
||||
print("Quitting")
|
||||
return None
|
||||
else:
|
||||
if not img_cond_path.startswith("/h"):
|
||||
print(f"Got invalid command '{img_cond_path}'\n{usage}")
|
||||
print(usage)
|
||||
continue
|
||||
|
||||
if img_cond_path == "":
|
||||
break
|
||||
|
||||
if not os.path.isfile(img_cond_path) or not img_cond_path.lower().endswith(
|
||||
(".jpg", ".jpeg", ".png", ".webp")
|
||||
):
|
||||
print(f"File '{img_cond_path}' does not exist or is not a valid image file")
|
||||
continue
|
||||
|
||||
options.img_cond_path = img_cond_path
|
||||
break
|
||||
|
||||
return options
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def main(
|
||||
name: str = "flux-dev",
|
||||
width: int = 1360,
|
||||
height: int = 768,
|
||||
seed: int | None = None,
|
||||
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
||||
num_steps: int | None = None,
|
||||
loop: bool = False,
|
||||
guidance: float = 2.5,
|
||||
offload: bool = False,
|
||||
output_dir: str = "output",
|
||||
add_sampling_metadata: bool = True,
|
||||
img_cond_path: str = "assets/robot.webp",
|
||||
track_usage: bool = False,
|
||||
):
|
||||
"""
|
||||
Sample the flux model. Either interactively (set `--loop`) or run for a
|
||||
single image.
|
||||
|
||||
Args:
|
||||
name: Name of the base model to use (either 'flux-dev' or 'flux-schnell')
|
||||
height: height of the sample in pixels (should be a multiple of 16)
|
||||
width: width of the sample in pixels (should be a multiple of 16)
|
||||
seed: Set a seed for sampling
|
||||
device: Pytorch device
|
||||
num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled)
|
||||
loop: start an interactive session and sample multiple times
|
||||
guidance: guidance value used for guidance distillation
|
||||
offload: offload models to CPU when not in use
|
||||
output_dir: where to save the output images
|
||||
add_sampling_metadata: Add the prompt to the image Exif metadata
|
||||
img_cond_path: path to conditioning image (jpeg/png/webp)
|
||||
track_usage: track usage of the model for licensing purposes
|
||||
"""
|
||||
|
||||
nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
|
||||
|
||||
if name not in (available := ["flux-dev", "flux-schnell"]):
|
||||
raise ValueError(f"Got unknown model name: {name}, chose from {available}")
|
||||
|
||||
torch_device = torch.device(device)
|
||||
if num_steps is None:
|
||||
num_steps = 4 if name == "flux-schnell" else 50
|
||||
|
||||
output_name = os.path.join(output_dir, "img_{idx}.jpg")
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
idx = 0
|
||||
else:
|
||||
fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
|
||||
if len(fns) > 0:
|
||||
idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
|
||||
else:
|
||||
idx = 0
|
||||
|
||||
# init all components
|
||||
t5 = load_t5(torch_device, max_length=256 if name == "flux-schnell" else 512)
|
||||
clip = load_clip(torch_device)
|
||||
model = load_flow_model(name, device="cpu" if offload else torch_device)
|
||||
ae = load_ae(name, device="cpu" if offload else torch_device)
|
||||
|
||||
# Download and initialize the Redux adapter
|
||||
redux_path = str(
|
||||
get_checkpoint_path("black-forest-labs/FLUX.1-Redux-dev", "flux1-redux-dev.safetensors", "FLUX_REDUX")
|
||||
)
|
||||
img_embedder = ReduxImageEncoder(torch_device, redux_path=redux_path)
|
||||
|
||||
rng = torch.Generator(device="cpu")
|
||||
prompt = ""
|
||||
opts = SamplingOptions(
|
||||
prompt=prompt,
|
||||
width=width,
|
||||
height=height,
|
||||
num_steps=num_steps,
|
||||
guidance=guidance,
|
||||
seed=seed,
|
||||
img_cond_path=img_cond_path,
|
||||
)
|
||||
|
||||
if loop:
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
|
||||
while opts is not None:
|
||||
if opts.seed is None:
|
||||
opts.seed = rng.seed()
|
||||
print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
|
||||
t0 = time.perf_counter()
|
||||
|
||||
# prepare input
|
||||
x = get_noise(
|
||||
1,
|
||||
opts.height,
|
||||
opts.width,
|
||||
device=torch_device,
|
||||
dtype=torch.bfloat16,
|
||||
seed=opts.seed,
|
||||
)
|
||||
opts.seed = None
|
||||
if offload:
|
||||
ae = ae.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
t5, clip = t5.to(torch_device), clip.to(torch_device)
|
||||
inp = prepare_redux(
|
||||
t5,
|
||||
clip,
|
||||
x,
|
||||
prompt=opts.prompt,
|
||||
encoder=img_embedder,
|
||||
img_cond_path=opts.img_cond_path,
|
||||
)
|
||||
timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))
|
||||
|
||||
# offload TEs to CPU, load model to gpu
|
||||
if offload:
|
||||
t5, clip = t5.cpu(), clip.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
model = model.to(torch_device)
|
||||
|
||||
# denoise initial noise
|
||||
x = denoise(model, **inp, timesteps=timesteps, guidance=opts.guidance)
|
||||
|
||||
# offload model, load autoencoder to gpu
|
||||
if offload:
|
||||
model.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
ae.decoder.to(x.device)
|
||||
|
||||
# decode latents to pixel space
|
||||
x = unpack(x.float(), opts.height, opts.width)
|
||||
with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
|
||||
x = ae.decode(x)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.synchronize()
|
||||
t1 = time.perf_counter()
|
||||
print(f"Done in {t1 - t0:.1f}s")
|
||||
|
||||
idx = save_image(
|
||||
nsfw_classifier, name, output_name, idx, x, add_sampling_metadata, prompt, track_usage=track_usage
|
||||
)
|
||||
|
||||
if loop:
|
||||
print("-" * 80)
|
||||
opts = parse_prompt(opts)
|
||||
opts = parse_img_cond_path(opts)
|
||||
else:
|
||||
opts = None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
Fire(main)
|
||||
@ -1,171 +0,0 @@
|
||||
import torch
|
||||
from einops import rearrange
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, LlavaForConditionalGeneration, pipeline
|
||||
|
||||
PROMPT_IMAGE_INTEGRITY = """
|
||||
Task: Analyze an image to identify potential copyright concerns or depictions of public figures.
|
||||
|
||||
Output: Respond with only "yes" or "no"
|
||||
|
||||
Criteria for "yes":
|
||||
- The image contains a recognizable character from copyrighted media (movies, TV, comics, games, etc.)
|
||||
- The image displays a trademarked logo or brand
|
||||
- The image depicts a recognizable public figure (celebrities, politicians, athletes, influencers, historical figures, etc.)
|
||||
|
||||
Criteria for "no":
|
||||
- All other cases
|
||||
- When you cannot identify the specific copyrighted work or named individual
|
||||
|
||||
Critical Requirements:
|
||||
1. You must be able to name the exact copyrighted work or specific person depicted
|
||||
2. General references to demographics or characteristics are not sufficient
|
||||
3. Base your decision solely on visual content, not interpretation
|
||||
4. Provide only the one-word answer: "yes" or "no"
|
||||
""".strip()
|
||||
|
||||
|
||||
PROMPT_IMAGE_INTEGRITY_FOLLOW_UP = "Does this image have copyright concerns or includes public figures?"
|
||||
|
||||
PROMPT_TEXT_INTEGRITY = """
|
||||
Task: Analyze a text prompt to identify potential copyright concerns or requests to depict living public figures.
|
||||
|
||||
Output: Respond with only "yes" or "no"
|
||||
|
||||
Criteria for "Yes":
|
||||
- The prompt explicitly names a character from copyrighted media (movies, TV, comics, games, etc.)
|
||||
- The prompt explicitly mentions a trademarked logo or brand
|
||||
- The prompt names or describes a specific living public figure (celebrities, politicians, athletes, influencers, etc.)
|
||||
|
||||
Criteria for "No":
|
||||
- All other cases
|
||||
- When you cannot identify the specific copyrighted work or named individual
|
||||
|
||||
Critical Requirements:
|
||||
1. You must be able to name the exact copyrighted work or specific person referenced
|
||||
2. General demographic descriptions or characteristics are not sufficient
|
||||
3. Analyze only the prompt text, not potential image outcomes
|
||||
4. Provide only the one-word answer: "yes" or "no"
|
||||
|
||||
The prompt to check is:
|
||||
-----
|
||||
{prompt}
|
||||
-----
|
||||
|
||||
Does this prompt have copyright concerns or includes public figures?
|
||||
""".strip()
|
||||
|
||||
|
||||
class PixtralContentFilter(torch.nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
device: torch.device = torch.device("cpu"),
|
||||
nsfw_threshold: float = 0.85,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
model_id = "mistral-community/pixtral-12b"
|
||||
self.processor = AutoProcessor.from_pretrained(model_id)
|
||||
self.model = LlavaForConditionalGeneration.from_pretrained(model_id, device_map=device)
|
||||
|
||||
self.yes_token, self.no_token = self.processor.tokenizer.encode(["yes", "no"])
|
||||
|
||||
self.nsfw_classifier = pipeline(
|
||||
"image-classification", model="Falconsai/nsfw_image_detection", device=device
|
||||
)
|
||||
self.nsfw_threshold = nsfw_threshold
|
||||
|
||||
def yes_no_logit_processor(
|
||||
self, input_ids: torch.LongTensor, scores: torch.FloatTensor
|
||||
) -> torch.FloatTensor:
|
||||
"""
|
||||
Masks every token except yes/no by pushing its score below the current minimum.
|
||||
"""
|
||||
scores_yes_token = scores[:, self.yes_token].clone()
|
||||
scores_no_token = scores[:, self.no_token].clone()
|
||||
scores_min = scores.min()
|
||||
scores[:, :] = scores_min - 1
|
||||
scores[:, self.yes_token] = scores_yes_token
|
||||
scores[:, self.no_token] = scores_no_token
|
||||
return scores
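A minimal standalone sketch of the same masking idea, outside the class (token ids and scores below are made up for illustration):

import torch

def mask_to_yes_no(scores: torch.Tensor, yes_token: int, no_token: int) -> torch.Tensor:
    # keep the yes/no logits and push every other logit below the current minimum
    keep_yes = scores[:, yes_token].clone()
    keep_no = scores[:, no_token].clone()
    scores[:, :] = scores.min() - 1
    scores[:, yes_token] = keep_yes
    scores[:, no_token] = keep_no
    return scores

scores = torch.tensor([[0.2, 1.5, -0.3, 0.9]])
print(mask_to_yes_no(scores, yes_token=1, no_token=2))
# tensor([[-1.3000,  1.5000, -0.3000, -1.3000]]) -> only "yes"/"no" remain sampleable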
|
||||
|
||||
def test_image(self, image: Image.Image | str | torch.Tensor) -> bool:
|
||||
if isinstance(image, torch.Tensor):
|
||||
image = rearrange(image[0].clamp(-1.0, 1.0), "c h w -> h w c")
|
||||
image = Image.fromarray((127.5 * (image + 1.0)).cpu().byte().numpy())
|
||||
elif isinstance(image, str):
|
||||
image = Image.open(image)
|
||||
|
||||
classification = next(c for c in self.nsfw_classifier(image) if c["label"] == "nsfw")
|
||||
if classification["score"] > self.nsfw_threshold:
|
||||
return True
|
||||
|
||||
# 512^2 pixels are enough for checking
|
||||
w, h = image.size
|
||||
f = (512**2 / (w * h)) ** 0.5
|
||||
image = image.resize((int(f * w), int(f * h)))
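# e.g. a 1024x1024 input gives f = (512**2 / (1024 * 1024)) ** 0.5 = 0.5, so the check runs on a 512x512 copy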
|
||||
|
||||
chat = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"content": PROMPT_IMAGE_INTEGRITY,
|
||||
},
|
||||
{
|
||||
"type": "image",
|
||||
"image": image,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"content": PROMPT_IMAGE_INTEGRITY_FOLLOW_UP,
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
inputs = self.processor.apply_chat_template(
|
||||
chat,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
).to(self.model.device)
|
||||
|
||||
generate_ids = self.model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=1,
|
||||
logits_processor=[self.yes_no_logit_processor],
|
||||
do_sample=False,
|
||||
)
|
||||
return generate_ids[0, -1].item() == self.yes_token
|
||||
|
||||
def test_txt(self, txt: str) -> bool:
|
||||
chat = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"content": PROMPT_TEXT_INTEGRITY.format(prompt=txt),
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
inputs = self.processor.apply_chat_template(
|
||||
chat,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
).to(self.model.device)
|
||||
|
||||
generate_ids = self.model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=1,
|
||||
logits_processor=[self.yes_no_logit_processor],
|
||||
do_sample=False,
|
||||
)
|
||||
return generate_ids[0, -1].item() == self.yes_token
|
||||
682
i2v_inference.py
@ -1,682 +0,0 @@
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
import json
|
||||
import torch
|
||||
import traceback
|
||||
import gc
|
||||
import random
|
||||
|
||||
# These imports rely on your existing code structure
|
||||
# They must match the location of your WAN code, etc.
|
||||
import wan
|
||||
from wan.configs import MAX_AREA_CONFIGS, WAN_CONFIGS
|
||||
from wan.modules.attention import get_attention_modes
|
||||
from wan.utils.utils import cache_video
|
||||
from mmgp import offload, safetensors2, profile_type
|
||||
|
||||
try:
|
||||
import triton
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
DATA_DIR = "ckpts"
|
||||
|
||||
# --------------------------------------------------
|
||||
# HELPER FUNCTIONS
|
||||
# --------------------------------------------------
|
||||
|
||||
def sanitize_file_name(file_name):
|
||||
"""Clean up file name from special chars."""
|
||||
return (
|
||||
file_name.replace("/", "")
|
||||
.replace("\\", "")
|
||||
.replace(":", "")
|
||||
.replace("|", "")
|
||||
.replace("?", "")
|
||||
.replace("<", "")
|
||||
.replace(">", "")
|
||||
.replace('"', "")
|
||||
)
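# Example (hypothetical name): sanitize_file_name('my/pre:set?.lset') -> 'mypreset.lset'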
|
||||
|
||||
def extract_preset(lset_name, lora_dir, loras):
|
||||
"""
|
||||
Load a .lset JSON that lists the LoRA files to apply, plus multipliers
|
||||
and possibly a suggested prompt prefix.
|
||||
"""
|
||||
lset_name = sanitize_file_name(lset_name)
|
||||
if not lset_name.endswith(".lset"):
|
||||
lset_name_filename = os.path.join(lora_dir, lset_name + ".lset")
|
||||
else:
|
||||
lset_name_filename = os.path.join(lora_dir, lset_name)
|
||||
|
||||
if not os.path.isfile(lset_name_filename):
|
||||
raise ValueError(f"Preset '{lset_name}' not found in {lora_dir}")
|
||||
|
||||
with open(lset_name_filename, "r", encoding="utf-8") as reader:
|
||||
text = reader.read()
|
||||
lset = json.loads(text)
|
||||
|
||||
loras_choices_files = lset["loras"]
|
||||
loras_choices = []
|
||||
missing_loras = []
|
||||
for lora_file in loras_choices_files:
|
||||
# Build absolute path and see if it is in loras
|
||||
full_lora_path = os.path.join(lora_dir, lora_file)
|
||||
if full_lora_path in loras:
|
||||
idx = loras.index(full_lora_path)
|
||||
loras_choices.append(str(idx))
|
||||
else:
|
||||
missing_loras.append(lora_file)
|
||||
|
||||
if len(missing_loras) > 0:
|
||||
missing_list = ", ".join(missing_loras)
|
||||
raise ValueError(f"Missing LoRA files for preset: {missing_list}")
|
||||
|
||||
loras_mult_choices = lset["loras_mult"]
|
||||
prompt_prefix = lset.get("prompt", "")
|
||||
full_prompt = lset.get("full_prompt", False)
|
||||
return loras_choices, loras_mult_choices, prompt_prefix, full_prompt
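For illustration, a minimal .lset preset that extract_preset can parse might look like the JSON below (the file and LoRA names are hypothetical; only the "loras", "loras_mult", "prompt" and "full_prompt" keys read above are used):

{
    "loras": ["style_a.safetensors", "motion_b.safetensors"],
    "loras_mult": "1.0 0.8",
    "prompt": "cinematic lighting, ",
    "full_prompt": false
}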
|
||||
|
||||
def get_attention_mode(args_attention, installed_modes):
|
||||
"""
|
||||
Decide which attention mode to use: either the user choice or auto fallback.
|
||||
"""
|
||||
if args_attention == "auto":
|
||||
for candidate in ["sage2", "sage", "sdpa"]:
|
||||
if candidate in installed_modes:
|
||||
return candidate
|
||||
return "sdpa" # last fallback
|
||||
elif args_attention in installed_modes:
|
||||
return args_attention
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Requested attention mode '{args_attention}' not installed. "
|
||||
f"Installed modes: {installed_modes}"
|
||||
)
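# Examples (hypothetical installed list):
#   get_attention_mode("auto", ["sdpa", "sage"]) -> "sage"      (candidates are tried as sage2, sage, sdpa)
#   get_attention_mode("flash", ["sdpa", "sage"]) -> ValueError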
|
||||
|
||||
def load_i2v_model(model_filename, text_encoder_filename, is_720p):
|
||||
"""
|
||||
Load the i2v model with a specific size config and text encoder.
|
||||
"""
|
||||
if is_720p:
|
||||
print("Loading 14B-720p i2v model ...")
|
||||
cfg = WAN_CONFIGS['i2v-14B']
|
||||
wan_model = wan.WanI2V(
|
||||
config=cfg,
|
||||
checkpoint_dir=DATA_DIR,
|
||||
model_filename=model_filename,
|
||||
text_encoder_filename=text_encoder_filename
|
||||
)
|
||||
else:
|
||||
print("Loading 14B-480p i2v model ...")
|
||||
cfg = WAN_CONFIGS['i2v-14B']
|
||||
wan_model = wan.WanI2V(
|
||||
config=cfg,
|
||||
checkpoint_dir=DATA_DIR,
|
||||
model_filename=model_filename,
|
||||
text_encoder_filename=text_encoder_filename
|
||||
)
|
||||
# Pipe structure
|
||||
pipe = {
|
||||
"transformer": wan_model.model,
|
||||
"text_encoder": wan_model.text_encoder.model,
|
||||
"text_encoder_2": wan_model.clip.model,
|
||||
"vae": wan_model.vae.model
|
||||
}
|
||||
return wan_model, pipe
|
||||
|
||||
def setup_loras(pipe, lora_dir, lora_preset, num_inference_steps):
|
||||
"""
|
||||
Load loras from a directory, optionally apply a preset.
|
||||
"""
|
||||
from pathlib import Path
|
||||
import glob
|
||||
|
||||
if not lora_dir or not Path(lora_dir).is_dir():
|
||||
print("No valid --lora-dir provided or directory doesn't exist, skipping LoRA setup.")
|
||||
return [], [], [], "", "", False
|
||||
|
||||
# Gather LoRA files
|
||||
loras = sorted(
|
||||
glob.glob(os.path.join(lora_dir, "*.sft"))
|
||||
+ glob.glob(os.path.join(lora_dir, "*.safetensors"))
|
||||
)
|
||||
loras_names = [Path(x).stem for x in loras]
|
||||
|
||||
# Offload them with no activation
|
||||
offload.load_loras_into_model(pipe["transformer"], loras, activate_all_loras=False)
|
||||
|
||||
# If user gave a preset, apply it
|
||||
default_loras_choices = []
|
||||
default_loras_multis_str = ""
|
||||
default_prompt_prefix = ""
|
||||
preset_applied_full_prompt = False
|
||||
if lora_preset:
|
||||
loras_choices, loras_mult, prefix, full_prompt = extract_preset(lora_preset, lora_dir, loras)
|
||||
default_loras_choices = loras_choices
|
||||
# If user stored loras_mult as a list or string in JSON, unify that to str
|
||||
if isinstance(loras_mult, list):
|
||||
# Just store them in a single line
|
||||
default_loras_multis_str = " ".join([str(x) for x in loras_mult])
|
||||
else:
|
||||
default_loras_multis_str = str(loras_mult)
|
||||
default_prompt_prefix = prefix
|
||||
preset_applied_full_prompt = full_prompt
|
||||
|
||||
return (
|
||||
loras,
|
||||
loras_names,
|
||||
default_loras_choices,
|
||||
default_loras_multis_str,
|
||||
default_prompt_prefix,
|
||||
preset_applied_full_prompt
|
||||
)
|
||||
|
||||
def parse_loras_and_activate(
|
||||
transformer,
|
||||
loras,
|
||||
loras_choices,
|
||||
loras_mult_str,
|
||||
num_inference_steps
|
||||
):
|
||||
"""
|
||||
Activate the chosen LoRAs with multipliers over the pipeline's transformer.
|
||||
Supports stepwise expansions (like "0.5,0.8" for partial steps).
|
||||
"""
|
||||
if not loras or not loras_choices:
|
||||
# no LoRAs selected
|
||||
return
|
||||
|
||||
# Handle multipliers
|
||||
def is_float_or_comma_list(x):
|
||||
"""
|
||||
Example: "0.5", or "0.8,1.0", etc. is valid.
|
||||
"""
|
||||
if not x:
|
||||
return False
|
||||
for chunk in x.split(","):
|
||||
try:
|
||||
float(chunk.strip())
|
||||
except ValueError:
|
||||
return False
|
||||
return True
|
||||
|
||||
# Convert multiline or spaced lines to a single list
|
||||
lines = [
|
||||
line.strip()
|
||||
for line in loras_mult_str.replace("\r", "\n").split("\n")
|
||||
if line.strip() and not line.strip().startswith("#")
|
||||
]
|
||||
# Now combine them by space
|
||||
joined_line = " ".join(lines) # "1.0 2.0,3.0"
|
||||
if not joined_line.strip():
|
||||
multipliers = []
|
||||
else:
|
||||
multipliers = joined_line.split(" ")
|
||||
|
||||
# Expand each item
|
||||
final_multipliers = []
|
||||
for mult in multipliers:
|
||||
mult = mult.strip()
|
||||
if not mult:
|
||||
continue
|
||||
if is_float_or_comma_list(mult):
|
||||
# Could be "0.7" or "0.5,0.6"
|
||||
if "," in mult:
|
||||
# expand over steps
|
||||
chunk_vals = [float(x.strip()) for x in mult.split(",")]
|
||||
expanded = expand_list_over_steps(chunk_vals, num_inference_steps)
|
||||
final_multipliers.append(expanded)
|
||||
else:
|
||||
final_multipliers.append(float(mult))
|
||||
else:
|
||||
raise ValueError(f"Invalid LoRA multiplier: '{mult}'")
|
||||
|
||||
# If fewer multipliers than chosen LoRAs => pad with 1.0
|
||||
needed = len(loras_choices) - len(final_multipliers)
|
||||
if needed > 0:
|
||||
final_multipliers += [1.0]*needed
|
||||
|
||||
# Actually activate them
|
||||
offload.activate_loras(transformer, loras_choices, final_multipliers)
|
||||
|
||||
def expand_list_over_steps(short_list, num_steps):
|
||||
"""
|
||||
If user gave (0.5, 0.8) for example, expand them over `num_steps`.
|
||||
The expansion walks linearly through the list so each value covers an equal share of the steps.
|
||||
"""
|
||||
result = []
|
||||
inc = len(short_list) / float(num_steps)
|
||||
idxf = 0.0
|
||||
for _ in range(num_steps):
|
||||
value = short_list[int(idxf)]
|
||||
result.append(value)
|
||||
idxf += inc
|
||||
return result
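# Worked example: expand_list_over_steps([0.5, 0.8], num_steps=4)
#   inc = 0.5, idxf walks 0.0, 0.5, 1.0, 1.5 -> indices 0, 0, 1, 1 -> [0.5, 0.5, 0.8, 0.8]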
|
||||
|
||||
def download_models_if_needed(transformer_filename_i2v, text_encoder_filename, local_folder=DATA_DIR):
|
||||
"""
|
||||
Checks if all required WAN 2.1 i2v files exist locally under 'ckpts/'.
|
||||
If not, downloads them from a Hugging Face Hub repo.
|
||||
Adjust the 'repo_id' and needed files as appropriate.
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"huggingface_hub is required for automatic model download. "
|
||||
"Please install it via `pip install huggingface_hub`."
|
||||
) from e
|
||||
|
||||
# Identify just the filename portion for each path
|
||||
def basename(path_str):
|
||||
return os.path.basename(path_str)
|
||||
|
||||
repo_id = "DeepBeepMeep/Wan2.1"
|
||||
target_root = local_folder
|
||||
|
||||
# You can customize this list as needed for i2v usage.
|
||||
# At minimum you need:
|
||||
# 1) The requested i2v transformer file
|
||||
# 2) The requested text encoder file
|
||||
# 3) VAE file
|
||||
# 4) The open-clip xlm-roberta-large weights
|
||||
#
|
||||
# If your i2v config references additional files, add them here.
|
||||
needed_files = [
|
||||
"Wan2.1_VAE.pth",
|
||||
"models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth",
|
||||
basename(text_encoder_filename),
|
||||
basename(transformer_filename_i2v),
|
||||
]
|
||||
|
||||
# The original script also downloads an entire "xlm-roberta-large" folder
|
||||
# via snapshot_download. If you require that for your pipeline,
|
||||
# you can add it here, for example:
|
||||
subfolder_name = "xlm-roberta-large"
|
||||
if not Path(os.path.join(target_root, subfolder_name)).exists():
|
||||
snapshot_download(repo_id=repo_id, allow_patterns=subfolder_name + "/*", local_dir=target_root)
|
||||
|
||||
for filename in needed_files:
|
||||
local_path = os.path.join(target_root, filename)
|
||||
if not os.path.isfile(local_path):
|
||||
print(f"File '{filename}' not found locally. Downloading from {repo_id} ...")
|
||||
hf_hub_download(
|
||||
repo_id=repo_id,
|
||||
filename=filename,
|
||||
local_dir=target_root
|
||||
)
|
||||
else:
|
||||
# Already present
|
||||
pass
|
||||
|
||||
print("All required i2v files are present.")
|
||||
|
||||
|
||||
# --------------------------------------------------
|
||||
# ARGUMENT PARSER
|
||||
# --------------------------------------------------
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Image-to-Video inference using WAN 2.1 i2v"
|
||||
)
|
||||
# Model + Tools
|
||||
parser.add_argument(
|
||||
"--quantize-transformer",
|
||||
action="store_true",
|
||||
help="Use on-the-fly transformer quantization"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--compile",
|
||||
action="store_true",
|
||||
help="Enable PyTorch 2.0 compile for the transformer"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--attention",
|
||||
type=str,
|
||||
default="auto",
|
||||
help="Which attention to use: auto, sdpa, sage, sage2, flash"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
type=int,
|
||||
default=4,
|
||||
help="Memory usage profile number [1..5]; see original script or use 2 if you have low VRAM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--preload",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Megabytes of the diffusion model to preload in VRAM (only used in some profiles)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Verbosity level [0..5]"
|
||||
)
|
||||
|
||||
# i2v Model
|
||||
parser.add_argument(
|
||||
"--transformer-file",
|
||||
type=str,
|
||||
default=f"{DATA_DIR}/wan2.1_image2video_480p_14B_quanto_int8.safetensors",
|
||||
help="Which i2v model to load"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--text-encoder-file",
|
||||
type=str,
|
||||
default=f"{DATA_DIR}/models_t5_umt5-xxl-enc-quanto_int8.safetensors",
|
||||
help="Which text encoder to use"
|
||||
)
|
||||
|
||||
# LoRA
|
||||
parser.add_argument(
|
||||
"--lora-dir",
|
||||
type=str,
|
||||
default="",
|
||||
help="Path to a directory containing i2v LoRAs"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lora-preset",
|
||||
type=str,
|
||||
default="",
|
||||
help="A .lset preset name in the lora_dir to auto-apply"
|
||||
)
|
||||
|
||||
# Generation Options
|
||||
parser.add_argument("--prompt", type=str, default=None, required=True, help="Prompt for generation")
|
||||
parser.add_argument("--negative-prompt", type=str, default="", help="Negative prompt")
|
||||
parser.add_argument("--resolution", type=str, default="832x480", help="WxH")
|
||||
parser.add_argument("--frames", type=int, default=64, help="Number of frames (16=1s if fps=16). Must be multiple of 4 +/- 1 in WAN.")
|
||||
parser.add_argument("--steps", type=int, default=30, help="Number of denoising steps.")
|
||||
parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale")
|
||||
parser.add_argument("--flow-shift", type=float, default=3.0, help="Flow shift parameter. Generally 3.0 for 480p, 5.0 for 720p.")
|
||||
parser.add_argument("--riflex", action="store_true", help="Enable RIFLEx for longer videos")
|
||||
parser.add_argument("--teacache", type=float, default=0.25, help="TeaCache multiplier, e.g. 0.5, 2.0, etc.")
|
||||
parser.add_argument("--teacache-start", type=float, default=0.1, help="Teacache start step percentage [0..100]")
|
||||
parser.add_argument("--seed", type=int, default=-1, help="Random seed. -1 means random each time.")
|
||||
parser.add_argument("--slg-layers", type=str, default=None, help="Which layers to use for skip layer guidance")
|
||||
parser.add_argument("--slg-start", type=float, default=0.0, help="Percentage in to start SLG")
|
||||
parser.add_argument("--slg-end", type=float, default=1.0, help="Percentage in to end SLG")
|
||||
|
||||
# LoRA usage
|
||||
parser.add_argument("--loras-choices", type=str, default="", help="Comma-separated list of chosen LoRA indices or preset names to load. Usually you only use the preset.")
|
||||
parser.add_argument("--loras-mult", type=str, default="", help="Multipliers for each chosen LoRA. Example: '1.0 1.2,1.3' etc.")
|
||||
|
||||
# Input
|
||||
parser.add_argument(
|
||||
"--input-image",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to an input image (or multiple)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-file",
|
||||
type=str,
|
||||
default="output.mp4",
|
||||
help="Where to save the resulting video."
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
# --------------------------------------------------
|
||||
# MAIN
|
||||
# --------------------------------------------------
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
# Setup environment
|
||||
offload.default_verboseLevel = args.verbose
|
||||
installed_attn_modes = get_attention_modes()
|
||||
|
||||
# Decide attention
|
||||
chosen_attention = get_attention_mode(args.attention, installed_attn_modes)
|
||||
offload.shared_state["_attention"] = chosen_attention
|
||||
|
||||
# Determine i2v resolution format
|
||||
if "720" in args.transformer_file:
|
||||
is_720p = True
|
||||
else:
|
||||
is_720p = False
|
||||
|
||||
# Make sure we have the needed models locally
|
||||
download_models_if_needed(args.transformer_file, args.text_encoder_file)
|
||||
|
||||
# Load i2v
|
||||
wan_model, pipe = load_i2v_model(
|
||||
model_filename=args.transformer_file,
|
||||
text_encoder_filename=args.text_encoder_file,
|
||||
is_720p=is_720p
|
||||
)
|
||||
wan_model._interrupt = False
|
||||
|
||||
# Offload / profile
|
||||
# e.g. for your script: offload.profile(pipe, profile_no=args.profile, compile=..., quantizeTransformer=...)
|
||||
# pass the budgets if you want, etc.
|
||||
kwargs = {}
|
||||
if args.profile == 2 or args.profile == 4:
|
||||
# preload is in MB
|
||||
if args.preload == 0:
|
||||
budgets = {"transformer": 100, "text_encoder": 100, "*": 1000}
|
||||
else:
|
||||
budgets = {"transformer": args.preload, "text_encoder": 100, "*": 1000}
|
||||
kwargs["budgets"] = budgets
|
||||
elif args.profile == 3:
|
||||
kwargs["budgets"] = {"*": "70%"}
|
||||
|
||||
compile_choice = "transformer" if args.compile else ""
|
||||
# Create the offload object
|
||||
offloadobj = offload.profile(
|
||||
pipe,
|
||||
profile_no=args.profile,
|
||||
compile=compile_choice,
|
||||
quantizeTransformer=args.quantize_transformer,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# If user wants to use LoRAs
|
||||
(
|
||||
loras,
|
||||
loras_names,
|
||||
default_loras_choices,
|
||||
default_loras_multis_str,
|
||||
preset_prompt_prefix,
|
||||
preset_full_prompt
|
||||
) = setup_loras(pipe, args.lora_dir, args.lora_preset, args.steps)
|
||||
|
||||
# Combine user prompt with preset prompt if the preset indicates so
|
||||
if preset_prompt_prefix:
|
||||
if preset_full_prompt:
|
||||
# Full override
|
||||
user_prompt = preset_prompt_prefix
|
||||
else:
|
||||
# Just prefix
|
||||
user_prompt = preset_prompt_prefix + "\n" + args.prompt
|
||||
else:
|
||||
user_prompt = args.prompt
|
||||
|
||||
# Actually parse user LoRA choices if they did not rely purely on the preset
|
||||
if args.loras_choices:
|
||||
# If user gave e.g. "0,1", we treat that as new additions
|
||||
lora_choice_list = [x.strip() for x in args.loras_choices.split(",")]
|
||||
else:
|
||||
# Use the defaults from the preset
|
||||
lora_choice_list = default_loras_choices
|
||||
|
||||
# Activate them
|
||||
parse_loras_and_activate(
|
||||
pipe["transformer"], loras, lora_choice_list, args.loras_mult or default_loras_multis_str, args.steps
|
||||
)
|
||||
|
||||
# Negative prompt
|
||||
negative_prompt = args.negative_prompt or ""
|
||||
|
||||
# Sanity check resolution
|
||||
if "*" in args.resolution.lower():
|
||||
print("ERROR: resolution must be e.g. 832x480 not '832*480'. Fixing it.")
|
||||
resolution_str = args.resolution.lower().replace("*", "x")
|
||||
else:
|
||||
resolution_str = args.resolution
|
||||
|
||||
try:
|
||||
width, height = [int(x) for x in resolution_str.split("x")]
|
||||
except ValueError:
|
||||
raise ValueError(f"Invalid resolution: '{resolution_str}'")
|
||||
|
||||
# Parse slg_layers from comma-separated string to a Python list of ints (or None if not provided)
|
||||
if args.slg_layers:
|
||||
slg_list = [int(x) for x in args.slg_layers.split(",")]
|
||||
else:
|
||||
slg_list = None
|
||||
|
||||
# Additional checks (from your original code).
|
||||
if "480p" in args.transformer_file:
|
||||
# Then we cannot exceed certain area for 480p model
|
||||
if width * height > 832*480:
|
||||
raise ValueError("You must use the 720p i2v model to generate bigger than 832x480.")
|
||||
# etc.
|
||||
|
||||
# Handle random seed
|
||||
if args.seed < 0:
|
||||
args.seed = random.randint(0, 999999999)
|
||||
print(f"Using seed={args.seed}")
|
||||
|
||||
# Setup tea cache if needed
|
||||
trans = wan_model.model
|
||||
trans.enable_cache = (args.teacache > 0)
|
||||
if trans.enable_cache:
|
||||
if "480p" in args.transformer_file:
|
||||
# example from your code
|
||||
trans.coefficients = [-3.02331670e+02, 2.23948934e+02, -5.25463970e+01, 5.87348440e+00, -2.01973289e-01]
|
||||
elif "720p" in args.transformer_file:
|
||||
trans.coefficients = [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
|
||||
else:
|
||||
raise ValueError("Teacache not supported for this model variant")
|
||||
|
||||
# Attempt generation
|
||||
print("Starting generation ...")
|
||||
start_time = time.time()
|
||||
|
||||
# Read the input image
|
||||
if not os.path.isfile(args.input_image):
|
||||
raise ValueError(f"Input image does not exist: {args.input_image}")
|
||||
|
||||
from PIL import Image
|
||||
input_img = Image.open(args.input_image).convert("RGB")
|
||||
|
||||
# Possibly load more than one image if you want "multiple images" – but here we'll just do single for demonstration
|
||||
|
||||
# Define the generation call
|
||||
# - frames => must be multiple of 4 plus 1 as per original script's note, e.g. 81, 65, ...
|
||||
# You can correct to that if needed:
|
||||
frame_count = (args.frames // 4)*4 + 1 # ensures it's 4*N+1
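# e.g. --frames 64 -> (64 // 4) * 4 + 1 = 65 frames; --frames 81 stays at 81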
|
||||
# RIFLEx
|
||||
enable_riflex = args.riflex
|
||||
|
||||
# If teacache => reset counters
|
||||
if trans.enable_cache:
|
||||
trans.teacache_counter = 0
|
||||
trans.cache_multiplier = args.teacache
|
||||
trans.cache_start_step = int(args.teacache_start * args.steps / 100.0)
|
||||
trans.num_steps = args.steps
|
||||
trans.cache_skipped_steps = 0
|
||||
trans.previous_residual_uncond = None
|
||||
trans.previous_residual_cond = None
|
||||
|
||||
# VAE Tiling
|
||||
device_mem_capacity = torch.cuda.get_device_properties(0).total_memory / 1048576
|
||||
if device_mem_capacity >= 28000: # 81 frames 720p requires about 28 GB VRAM
|
||||
use_vae_config = 1
|
||||
elif device_mem_capacity >= 8000:
|
||||
use_vae_config = 2
|
||||
else:
|
||||
use_vae_config = 3
|
||||
|
||||
if use_vae_config == 1:
|
||||
VAE_tile_size = 0
|
||||
elif use_vae_config == 2:
|
||||
VAE_tile_size = 256
|
||||
else:
|
||||
VAE_tile_size = 128
|
||||
|
||||
print('Using VAE tile size of', VAE_tile_size)
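# e.g. a 24 GB GPU reports roughly 24576 MB: below 28000 but above 8000, so use_vae_config = 2 and VAE_tile_size = 256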
|
||||
|
||||
# Actually run the i2v generation
|
||||
try:
|
||||
sample_frames = wan_model.generate(
|
||||
input_prompt = user_prompt,
|
||||
image_start = input_img,
|
||||
frame_num=frame_count,
|
||||
width=width,
|
||||
height=height,
|
||||
# max_area=MAX_AREA_CONFIGS[f"{width}*{height}"], # or you can pass your custom
|
||||
shift=args.flow_shift,
|
||||
sampling_steps=args.steps,
|
||||
guide_scale=args.guidance_scale,
|
||||
n_prompt=negative_prompt,
|
||||
seed=args.seed,
|
||||
offload_model=False,
|
||||
callback=None, # or define your own callback if you want
|
||||
enable_RIFLEx=enable_riflex,
|
||||
VAE_tile_size=VAE_tile_size,
|
||||
joint_pass=slg_list is None, # set if you want a small speed improvement without SLG
|
||||
slg_layers=slg_list,
|
||||
slg_start=args.slg_start,
|
||||
slg_end=args.slg_end,
|
||||
)
|
||||
except Exception as e:
|
||||
offloadobj.unload_all()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
err_str = f"Generation failed with error: {e}"
|
||||
# Attempt to detect OOM errors
|
||||
s = str(e).lower()
|
||||
if any(keyword in s for keyword in ["memory", "cuda", "alloc"]):
|
||||
raise RuntimeError("Likely out-of-VRAM or out-of-RAM error. " + err_str)
|
||||
else:
|
||||
traceback.print_exc()
|
||||
raise RuntimeError(err_str)
|
||||
|
||||
# After generation
|
||||
offloadobj.unload_all()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
if sample_frames is None:
|
||||
raise RuntimeError("No frames were returned (maybe generation was aborted or failed).")
|
||||
|
||||
# If teacache was used, we can see how many steps were skipped
|
||||
if trans.enable_cache:
|
||||
print(f"TeaCache skipped steps: {trans.teacache_skipped_steps} / {args.steps}")
|
||||
|
||||
# Save result
|
||||
sample_frames = sample_frames.cpu() # shape = c, t, h, w => [3, T, H, W]
|
||||
os.makedirs(os.path.dirname(args.output_file) or ".", exist_ok=True)
|
||||
|
||||
# Use the provided helper from your code to store the MP4
|
||||
# By default, you used cache_video(tensor=..., save_file=..., fps=16, ...)
|
||||
# or you can do your own. We'll do the same for consistency:
|
||||
cache_video(
|
||||
tensor=sample_frames[None], # shape => [1, c, T, H, W]
|
||||
save_file=args.output_file,
|
||||
fps=16,
|
||||
nrow=1,
|
||||
normalize=True,
|
||||
value_range=(-1, 1)
|
||||
)
|
||||
|
||||
end_time = time.time()
|
||||
elapsed_s = end_time - start_time
|
||||
print(f"Done! Output written to {args.output_file}. Generation time: {elapsed_s:.1f} seconds.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
1
loras_qwen/Readme.txt
Normal file
@ -0,0 +1 @@
LTX Video loras
2
models/flux/__init__.py
Normal file
@ -0,0 +1,2 @@
from .flux_main import model_factory
from . import flux_handler
121
models/flux/flux_handler.py
Normal file
@ -0,0 +1,121 @@
|
||||
import torch
|
||||
|
||||
def get_ltxv_text_encoder_filename(text_encoder_quantization):
|
||||
text_encoder_filename = "ckpts/T5_xxl_1.1/T5_xxl_1.1_enc_bf16.safetensors"
|
||||
if text_encoder_quantization =="int8":
|
||||
text_encoder_filename = text_encoder_filename.replace("bf16", "quanto_bf16_int8")
|
||||
return text_encoder_filename
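# Example: get_ltxv_text_encoder_filename("int8")
#   -> "ckpts/T5_xxl_1.1/T5_xxl_1.1_enc_quanto_bf16_int8.safetensors"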
|
||||
|
||||
class family_handler():
|
||||
@staticmethod
|
||||
def query_model_def(base_model_type, model_def):
|
||||
flux_model = model_def.get("flux-model", "flux-dev")
|
||||
flux_schnell = flux_model == "flux-schnell"
|
||||
flux_chroma = flux_model == "flux-chroma"
|
||||
flux_uso = flux_model == "flux-dev-uso"
|
||||
model_def_output = {
|
||||
"image_outputs" : True,
|
||||
"no_negative_prompt" : not flux_chroma,
|
||||
}
|
||||
if flux_chroma:
|
||||
model_def_output["guidance_max_phases"] = 1
|
||||
elif not flux_schnell:
|
||||
model_def_output["embedded_guidance"] = True
|
||||
if flux_uso :
|
||||
model_def_output["any_image_refs_relative_size"] = True
|
||||
model_def_output["no_background_removal"] = True
|
||||
|
||||
model_def_output["image_ref_choices"] = {
|
||||
"choices":[("No Reference Image", ""),("First Image is a Reference Image, and then the next ones (up to two) are Style Images", "I"),
|
||||
("Up to two Images are Style Images", "IJ")],
|
||||
"default": "I",
|
||||
"letters_filter": "IJ",
|
||||
"label": "Reference Images / Style Images"
|
||||
}
|
||||
|
||||
return model_def_output
|
||||
|
||||
@staticmethod
|
||||
def query_supported_types():
|
||||
return ["flux"]
|
||||
|
||||
@staticmethod
|
||||
def query_family_maps():
|
||||
return {}, {}
|
||||
|
||||
@staticmethod
|
||||
def get_rgb_factors(base_model_type ):
|
||||
from shared.RGB_factors import get_rgb_factors
|
||||
latent_rgb_factors, latent_rgb_factors_bias = get_rgb_factors("flux")
|
||||
return latent_rgb_factors, latent_rgb_factors_bias
|
||||
|
||||
|
||||
@staticmethod
|
||||
def query_model_family():
|
||||
return "flux"
|
||||
|
||||
@staticmethod
|
||||
def query_family_infos():
|
||||
return {"flux":(30, "Flux 1")}
|
||||
|
||||
@staticmethod
|
||||
def query_model_files(computeList, base_model_type, model_filename, text_encoder_quantization):
|
||||
text_encoder_filename = get_ltxv_text_encoder_filename(text_encoder_quantization)
|
||||
return [
|
||||
{
|
||||
"repoId" : "DeepBeepMeep/Flux",
|
||||
"sourceFolderList" : ["siglip-so400m-patch14-384", "",],
|
||||
"fileList" : [ ["config.json", "preprocessor_config.json", "model.safetensors"], ["flux_vae.safetensors"] ]
|
||||
},
|
||||
{
|
||||
"repoId" : "DeepBeepMeep/LTX_Video",
|
||||
"sourceFolderList" : ["T5_xxl_1.1"],
|
||||
"fileList" : [ ["added_tokens.json", "special_tokens_map.json", "spiece.model", "tokenizer_config.json"] + computeList(text_encoder_filename) ]
|
||||
},
|
||||
{
|
||||
"repoId" : "DeepBeepMeep/HunyuanVideo",
|
||||
"sourceFolderList" : [ "clip_vit_large_patch14", ],
|
||||
"fileList" :[
|
||||
["config.json", "merges.txt", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "vocab.json"],
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def load_model(model_filename, model_type, base_model_type, model_def, quantizeTransformer = False, text_encoder_quantization = None, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False, save_quantized = False):
|
||||
from .flux_main import model_factory
|
||||
|
||||
flux_model = model_factory(
|
||||
checkpoint_dir="ckpts",
|
||||
model_filename=model_filename,
|
||||
model_type = model_type,
|
||||
model_def = model_def,
|
||||
base_model_type=base_model_type,
|
||||
text_encoder_filename= get_ltxv_text_encoder_filename(text_encoder_quantization),
|
||||
quantizeTransformer = quantizeTransformer,
|
||||
dtype = dtype,
|
||||
VAE_dtype = VAE_dtype,
|
||||
mixed_precision_transformer = mixed_precision_transformer,
|
||||
save_quantized = save_quantized
|
||||
)
|
||||
|
||||
pipe = { "transformer": flux_model.model, "vae" : flux_model.vae, "text_encoder" : flux_model.clip, "text_encoder_2" : flux_model.t5}
|
||||
|
||||
if flux_model.vision_encoder is not None:
|
||||
pipe["siglip_model"] = flux_model.vision_encoder
|
||||
if flux_model.feature_embedder is not None:
|
||||
pipe["feature_embedder"] = flux_model.feature_embedder
|
||||
return flux_model, pipe
|
||||
|
||||
@staticmethod
|
||||
def update_default_settings(base_model_type, model_def, ui_defaults):
|
||||
flux_model = model_def.get("flux-model", "flux-dev")
|
||||
flux_uso = flux_model == "flux-dev-uso"
|
||||
ui_defaults.update({
|
||||
"embedded_guidance": 2.5,
|
||||
})
|
||||
if model_def.get("reference_image", False):
|
||||
ui_defaults.update({
|
||||
"video_prompt_type": "I" if flux_uso else "KI",
|
||||
})
|
||||
|
||||
221
models/flux/flux_main.py
Normal file
@ -0,0 +1,221 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from glob import iglob
|
||||
from mmgp import offload as offload
|
||||
import torch
|
||||
from shared.utils.utils import calculate_new_dimensions
|
||||
from .sampling import denoise, get_schedule, prepare_kontext, prepare_prompt, prepare_multi_ip, unpack
|
||||
from .modules.layers import get_linear_split_map
|
||||
from transformers import SiglipVisionModel, SiglipImageProcessor
|
||||
|
||||
from .util import (
|
||||
aspect_ratio_to_height_width,
|
||||
load_ae,
|
||||
load_clip,
|
||||
load_flow_model,
|
||||
load_t5,
|
||||
save_image,
|
||||
)
|
||||
|
||||
from PIL import Image
|
||||
|
||||
def stitch_images(img1, img2):
|
||||
# Resize img2 to match img1's height
|
||||
width1, height1 = img1.size
|
||||
width2, height2 = img2.size
|
||||
new_width2 = int(width2 * height1 / height2)
|
||||
img2_resized = img2.resize((new_width2, height1), Image.Resampling.LANCZOS)
|
||||
|
||||
stitched = Image.new('RGB', (width1 + new_width2, height1))
|
||||
stitched.paste(img1, (0, 0))
|
||||
stitched.paste(img2_resized, (width1, 0))
|
||||
return stitched
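A minimal usage sketch of stitch_images (synthetic images, sizes chosen only for illustration):

from PIL import Image

img_a = Image.new("RGB", (640, 480), "black")
img_b = Image.new("RGB", (200, 240), "white")
combined = stitch_images(img_a, img_b)
# img_b is scaled to img_a's height (400x480 here), so the result is 1040x480
print(combined.size)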
|
||||
|
||||
class model_factory:
|
||||
def __init__(
|
||||
self,
|
||||
checkpoint_dir,
|
||||
model_filename = None,
|
||||
model_type = None,
|
||||
model_def = None,
|
||||
base_model_type = None,
|
||||
text_encoder_filename = None,
|
||||
quantizeTransformer = False,
|
||||
save_quantized = False,
|
||||
dtype = torch.bfloat16,
|
||||
VAE_dtype = torch.float32,
|
||||
mixed_precision_transformer = False
|
||||
):
|
||||
self.device = torch.device(f"cuda")
|
||||
self.VAE_dtype = VAE_dtype
|
||||
self.dtype = dtype
|
||||
torch_device = "cpu"
|
||||
self.guidance_max_phases = model_def.get("guidance_max_phases", 0)
|
||||
|
||||
# model_filename = ["c:/temp/flux1-schnell.safetensors"]
|
||||
|
||||
self.t5 = load_t5(torch_device, text_encoder_filename, max_length=512)
|
||||
self.clip = load_clip(torch_device)
|
||||
self.name = model_def.get("flux-model", "flux-dev")
|
||||
# self.name= "flux-dev-kontext"
|
||||
# self.name= "flux-dev"
|
||||
# self.name= "flux-schnell"
|
||||
source = model_def.get("source", None)
|
||||
self.model = load_flow_model(self.name, model_filename[0] if source is None else source, torch_device)
|
||||
|
||||
self.vae = load_ae(self.name, device=torch_device)
|
||||
|
||||
siglip_processor = siglip_model = feature_embedder = None
|
||||
if self.name == 'flux-dev-uso':
|
||||
siglip_path = "ckpts/siglip-so400m-patch14-384"
|
||||
siglip_processor = SiglipImageProcessor.from_pretrained(siglip_path)
|
||||
siglip_model = SiglipVisionModel.from_pretrained(siglip_path)
|
||||
siglip_model.eval().to("cpu")
|
||||
if len(model_filename) > 1:
|
||||
from .modules.layers import SigLIPMultiFeatProjModel
|
||||
feature_embedder = SigLIPMultiFeatProjModel(
|
||||
siglip_token_nums=729,
|
||||
style_token_nums=64,
|
||||
siglip_token_dims=1152,
|
||||
hidden_size=3072, #self.hidden_size,
|
||||
context_layer_norm=True,
|
||||
)
|
||||
offload.load_model_data(feature_embedder, model_filename[1])
|
||||
self.vision_encoder = siglip_model
|
||||
self.vision_encoder_processor = siglip_processor
|
||||
self.feature_embedder = feature_embedder
|
||||
|
||||
# offload.change_dtype(self.model, dtype, True)
|
||||
# offload.save_model(self.model, "flux-dev.safetensors")
|
||||
|
||||
if source is not None:
|
||||
from wgp import save_model
|
||||
save_model(self.model, model_type, dtype, None)
|
||||
|
||||
if save_quantized:
|
||||
from wgp import save_quantized_model
|
||||
save_quantized_model(self.model, model_type, model_filename[0], dtype, None)
|
||||
|
||||
split_linear_modules_map = get_linear_split_map()
|
||||
self.model.split_linear_modules_map = split_linear_modules_map
|
||||
offload.split_linear_modules(self.model, split_linear_modules_map )
|
||||
|
||||
|
||||
def generate(
|
||||
self,
|
||||
seed: int | None = None,
|
||||
input_prompt: str = "replace the logo with the text 'Black Forest Labs'",
|
||||
n_prompt: str = None,
|
||||
sampling_steps: int = 20,
|
||||
input_ref_images = None,
|
||||
width= 832,
|
||||
height=480,
|
||||
embedded_guidance_scale: float = 2.5,
|
||||
guide_scale = 2.5,
|
||||
fit_into_canvas = None,
|
||||
callback = None,
|
||||
loras_slists = None,
|
||||
batch_size = 1,
|
||||
video_prompt_type = "",
|
||||
joint_pass = False,
|
||||
image_refs_relative_size = 100,
|
||||
**bbargs
|
||||
):
|
||||
if self._interrupt:
|
||||
return None
|
||||
if self.guidance_max_phases < 1: guide_scale = 1
|
||||
if n_prompt is None or len(n_prompt) == 0: n_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
|
||||
device="cuda"
|
||||
flux_dev_uso = self.name in ['flux-dev-uso']
|
||||
image_stiching = self.name not in ['flux-dev-uso']
|
||||
|
||||
input_ref_images = [] if input_ref_images is None else input_ref_images[:]
|
||||
ref_style_imgs = []
|
||||
|
||||
if "I" in video_prompt_type and len(input_ref_images) > 0:
|
||||
if flux_dev_uso :
|
||||
if "J" in video_prompt_type:
|
||||
ref_style_imgs = input_ref_images
|
||||
input_ref_images = []
|
||||
elif len(input_ref_images) > 1 :
|
||||
ref_style_imgs = input_ref_images[-1:]
|
||||
input_ref_images = input_ref_images[:-1]
|
||||
if image_stiching:
|
||||
# image stitching method
|
||||
stiched = input_ref_images[0]
|
||||
if "K" in video_prompt_type :
|
||||
w, h = input_ref_images[0].size
|
||||
height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
|
||||
|
||||
for new_img in input_ref_images[1:]:
|
||||
stiched = stitch_images(stiched, new_img)
|
||||
input_ref_images = [stiched]
|
||||
else:
|
||||
first_ref = 0
|
||||
if "K" in video_prompt_type:
|
||||
# image latents tiling method
|
||||
w, h = input_ref_images[0].size
|
||||
height, width = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
|
||||
input_ref_images[0] = input_ref_images[0].resize((width, height), resample=Image.Resampling.LANCZOS)
|
||||
first_ref = 1
|
||||
|
||||
for i in range(first_ref,len(input_ref_images)):
|
||||
w, h = input_ref_images[i].size
|
||||
image_height, image_width = calculate_new_dimensions(int(height*image_refs_relative_size/100), int(width*image_refs_relative_size/100), h, w, fit_into_canvas)
|
||||
input_ref_images[i] = input_ref_images[i].resize((image_width, image_height), resample=Image.Resampling.LANCZOS)
|
||||
else:
|
||||
input_ref_images = None
|
||||
|
||||
if flux_dev_uso :
|
||||
inp, height, width = prepare_multi_ip(
|
||||
ae=self.vae,
|
||||
img_cond_list=input_ref_images,
|
||||
target_width=width,
|
||||
target_height=height,
|
||||
bs=batch_size,
|
||||
seed=seed,
|
||||
device=device,
|
||||
)
|
||||
else:
|
||||
inp, height, width = prepare_kontext(
|
||||
ae=self.vae,
|
||||
img_cond_list=input_ref_images,
|
||||
target_width=width,
|
||||
target_height=height,
|
||||
bs=batch_size,
|
||||
seed=seed,
|
||||
device=device,
|
||||
)
|
||||
|
||||
inp.update(prepare_prompt(self.t5, self.clip, batch_size, input_prompt))
|
||||
if guide_scale != 1:
|
||||
inp.update(prepare_prompt(self.t5, self.clip, batch_size, n_prompt, neg = True, device=device))
|
||||
|
||||
timesteps = get_schedule(sampling_steps, inp["img"].shape[1], shift=(self.name != "flux-schnell"))
|
||||
|
||||
ref_style_imgs = [self.vision_encoder_processor(img, return_tensors="pt").to(self.device) for img in ref_style_imgs]
|
||||
if self.feature_embedder is not None and ref_style_imgs is not None and len(ref_style_imgs) > 0 and self.vision_encoder is not None:
|
||||
# project style features into the textual hidden space
|
||||
siglip_embedding = [self.vision_encoder(**emb, output_hidden_states=True) for emb in ref_style_imgs]
|
||||
siglip_embedding = torch.cat([self.feature_embedder(emb) for emb in siglip_embedding], dim=1)
|
||||
siglip_embedding_ids = torch.zeros( siglip_embedding.shape[0], siglip_embedding.shape[1], 3 ).to(device)
|
||||
inp["siglip_embedding"] = siglip_embedding
|
||||
inp["siglip_embedding_ids"] = siglip_embedding_ids
|
||||
|
||||
def unpack_latent(x):
|
||||
return unpack(x.float(), height, width)
|
||||
|
||||
# denoise initial noise
|
||||
x = denoise(self.model, **inp, timesteps=timesteps, guidance=embedded_guidance_scale, real_guidance_scale =guide_scale, callback=callback, pipeline=self, loras_slists= loras_slists, unpack_latent = unpack_latent, joint_pass = joint_pass)
|
||||
if x is None: return None
|
||||
# decode latents to pixel space
|
||||
x = unpack_latent(x)
|
||||
with torch.autocast(device_type=device, dtype=torch.bfloat16):
|
||||
x = self.vae.decode(x)
|
||||
|
||||
x = x.clamp(-1, 1)
|
||||
x = x.transpose(0, 1)
|
||||
return x
@ -1,7 +1,7 @@
import torch
from einops import rearrange
from torch import Tensor
from wan.modules.attention import pay_attention
from shared.attention import pay_attention

def attention(qkv_list, pe: Tensor) -> Tensor:
|
||||
296
models/flux/model.py
Normal file
@ -0,0 +1,296 @@
|
||||
from dataclasses import dataclass

import torch
from torch import Tensor, nn

from .modules.layers import (
    DoubleStreamBlock,
    EmbedND,
    LastLayer,
    MLPEmbedder,
    SingleStreamBlock,
    timestep_embedding,
    DistilledGuidance,
    ChromaModulationOut,
    SigLIPMultiFeatProjModel,
)
from .modules.lora import LinearLora, replace_linear_with_lora


@dataclass
class FluxParams:
    in_channels: int
    out_channels: int
    vec_in_dim: int
    context_in_dim: int
    hidden_size: int
    mlp_ratio: float
    num_heads: int
    depth: int
    depth_single_blocks: int
    axes_dim: list[int]
    theta: int
    qkv_bias: bool
    guidance_embed: bool
    chroma: bool = False
    eso: bool = False


class Flux(nn.Module):
    """
    Transformer model for flow matching on sequences.
    """

    def get_modulations(self, tensor: torch.Tensor, block_type: str, *, idx: int = 0):
        # This function slices up the modulations tensor which has the following layout:
        #   single     : num_single_blocks * 3 elements
        #   double_img : num_double_blocks * 6 elements
        #   double_txt : num_double_blocks * 6 elements
        #   final      : 2 elements
        if block_type == "final":
            return (tensor[:, -2:-1, :], tensor[:, -1:, :])
        single_block_count = self.params.depth_single_blocks
        double_block_count = self.params.depth
        offset = 3 * idx
        if block_type == "single":
            return ChromaModulationOut.from_offset(tensor, offset)
        # Double block modulations are 6 elements so we double 3 * idx.
        offset *= 2
        if block_type in {"double_img", "double_txt"}:
            # Advance past the single block modulations.
            offset += 3 * single_block_count
            if block_type == "double_txt":
                # Advance past the double block img modulations.
                offset += 6 * double_block_count
            return (
                ChromaModulationOut.from_offset(tensor, offset),
                ChromaModulationOut.from_offset(tensor, offset + 3),
            )
        raise ValueError("Bad block_type")
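A quick arithmetic check of that layout (editor's sketch, not part of the diff), using the flux-chroma parameters that appear later in this change set (depth=19, depth_single_blocks=38):

# single + double_img + double_txt + final must cover the whole modulation tensor
depth, depth_single_blocks = 19, 38          # from the "flux-chroma" FluxParams below
single     = depth_single_blocks * 3         # one (shift, scale, gate) per single block
double_img = depth * 6                       # two (shift, scale, gate) triples per double block
double_txt = depth * 6
final      = 2                               # shift and scale for the last layer
assert single + double_img + double_txt + final == 344   # == mod_index_length in Flux.forward()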
def __init__(self, params: FluxParams):
|
||||
super().__init__()
|
||||
|
||||
self.params = params
|
||||
self.in_channels = params.in_channels
|
||||
self.out_channels = params.out_channels
|
||||
self.chroma = params.chroma
|
||||
if params.hidden_size % params.num_heads != 0:
|
||||
raise ValueError(
|
||||
f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
|
||||
)
|
||||
pe_dim = params.hidden_size // params.num_heads
|
||||
if sum(params.axes_dim) != pe_dim:
|
||||
raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
|
||||
self.hidden_size = params.hidden_size
|
||||
self.num_heads = params.num_heads
|
||||
self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
|
||||
self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
|
||||
|
||||
self.guidance_in = (
|
||||
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
|
||||
)
|
||||
self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
|
||||
if self.chroma:
|
||||
self.distilled_guidance_layer = DistilledGuidance(
|
||||
in_dim=64,
|
||||
hidden_dim=5120,
|
||||
out_dim=3072,
|
||||
n_layers=5,
|
||||
)
|
||||
else:
|
||||
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
|
||||
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
|
||||
|
||||
self.double_blocks = nn.ModuleList(
|
||||
[
|
||||
DoubleStreamBlock(
|
||||
self.hidden_size,
|
||||
self.num_heads,
|
||||
mlp_ratio=params.mlp_ratio,
|
||||
qkv_bias=params.qkv_bias,
|
||||
chroma_modulation = self.chroma,
|
||||
)
|
||||
for _ in range(params.depth)
|
||||
]
|
||||
)
|
||||
|
||||
self.single_blocks = nn.ModuleList(
|
||||
[
|
||||
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, chroma_modulation = self.chroma)
|
||||
for _ in range(params.depth_single_blocks)
|
||||
]
|
||||
)
|
||||
|
||||
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, chroma_modulation = self.chroma)
|
||||
|
||||
def preprocess_loras(self, model_type, sd):
|
||||
new_sd = {}
|
||||
if len(sd) == 0: return sd
|
||||
|
||||
def swap_scale_shift(weight):
|
||||
shift, scale = weight.chunk(2, dim=0)
|
||||
new_weight = torch.cat([scale, shift], dim=0)
|
||||
return new_weight
|
||||
|
||||
first_key= next(iter(sd))
|
||||
if first_key.startswith("lora_unet_"):
|
||||
new_sd = {}
|
||||
print("Converting Lora Safetensors format to Lora Diffusers format")
|
||||
repl_list = ["linear1", "linear2", "modulation", "img_attn", "txt_attn", "img_mlp", "txt_mlp", "img_mod", "txt_mod"]
|
||||
src_list = ["_" + k + "." for k in repl_list]
|
||||
src_list2 = ["_" + k + "_" for k in repl_list]
|
||||
tgt_list = ["." + k + "." for k in repl_list]
|
||||
|
||||
for k,v in sd.items():
|
||||
k = k.replace("lora_unet_blocks_","diffusion_model.blocks.")
|
||||
k = k.replace("lora_unet__blocks_","diffusion_model.blocks.")
|
||||
k = k.replace("lora_unet_single_blocks_","diffusion_model.single_blocks.")
|
||||
k = k.replace("lora_unet_double_blocks_","diffusion_model.double_blocks.")
|
||||
|
||||
for s,s2, t in zip(src_list, src_list2, tgt_list):
|
||||
k = k.replace(s,t)
|
||||
k = k.replace(s2,t)
|
||||
|
||||
k = k.replace("lora_up","lora_B")
|
||||
k = k.replace("lora_down","lora_A")
|
||||
|
||||
new_sd[k] = v
|
||||
|
||||
elif first_key.startswith("transformer."):
|
||||
root_src = ["time_text_embed.timestep_embedder.linear_1", "time_text_embed.timestep_embedder.linear_2", "time_text_embed.text_embedder.linear_1", "time_text_embed.text_embedder.linear_2",
|
||||
"time_text_embed.guidance_embedder.linear_1", "time_text_embed.guidance_embedder.linear_2",
|
||||
"x_embedder", "context_embedder", "proj_out" ]
|
||||
|
||||
root_tgt = ["time_in.in_layer", "time_in.out_layer", "vector_in.in_layer", "vector_in.out_layer",
|
||||
"guidance_in.in_layer", "guidance_in.out_layer",
|
||||
"img_in", "txt_in", "final_layer.linear" ]
|
||||
|
||||
double_src = ["norm1.linear", "norm1_context.linear", "attn.norm_q", "attn.norm_k", "ff.net.0.proj", "ff.net.2", "ff_context.net.0.proj", "ff_context.net.2", "attn.to_out.0" ,"attn.to_add_out", "attn.to_out", ".attn.to_", ".attn.add_q_proj.", ".attn.add_k_proj.", ".attn.add_v_proj.", ]
|
||||
double_tgt = ["img_mod.lin", "txt_mod.lin", "img_attn.norm.query_norm", "img_attn.norm.key_norm", "img_mlp.0", "img_mlp.2", "txt_mlp.0", "txt_mlp.2", "img_attn.proj", "txt_attn.proj", "img_attn.proj", ".img_attn.", ".txt_attn.q.", ".txt_attn.k.", ".txt_attn.v."]
|
||||
|
||||
single_src = ["norm.linear", "attn.norm_q", "attn.norm_k", "proj_out",".attn.to_q.", ".attn.to_k.", ".attn.to_v.", ".proj_mlp."]
|
||||
single_tgt = ["modulation.lin","norm.query_norm", "norm.key_norm", "linear2", ".linear1_attn_q.", ".linear1_attn_k.", ".linear1_attn_v.", ".linear1_mlp."]
|
||||
|
||||
|
||||
for k,v in sd.items():
|
||||
if k.startswith("transformer.single_transformer_blocks"):
|
||||
k = k.replace("transformer.single_transformer_blocks", "diffusion_model.single_blocks")
|
||||
for src, tgt in zip(single_src, single_tgt):
|
||||
k = k.replace(src, tgt)
|
||||
elif k.startswith("transformer.transformer_blocks"):
|
||||
k = k.replace("transformer.transformer_blocks", "diffusion_model.double_blocks")
|
||||
for src, tgt in zip(double_src, double_tgt):
|
||||
k = k.replace(src, tgt)
|
||||
else:
|
||||
k = k.replace("transformer.", "diffusion_model.")
|
||||
for src, tgt in zip(root_src, root_tgt):
|
||||
k = k.replace(src, tgt)
|
||||
|
||||
if "norm_out.linear" in k:
|
||||
if "lora_B" in k:
|
||||
v = swap_scale_shift(v)
|
||||
k = k.replace("norm_out.linear", "final_layer.adaLN_modulation.1")
|
||||
new_sd[k] = v
|
||||
else:
|
||||
new_sd = sd
|
||||
return new_sd
|
||||
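As an illustration of the key renaming above (editor's sketch; the checkpoint key is hypothetical), a Kohya-style "lora_unet_" key ends up in the diffusion_model layout like this:

k = "lora_unet_double_blocks_0_img_attn_proj.lora_up.weight"      # hypothetical input key
k = k.replace("lora_unet_double_blocks_", "diffusion_model.double_blocks.")
k = k.replace("_img_attn_", ".img_attn.")    # second pass of the repl_list substitutions
k = k.replace("lora_up", "lora_B")
assert k == "diffusion_model.double_blocks.0.img_attn.proj.lora_B.weight"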
|
||||
def forward(
|
||||
self,
|
||||
img: Tensor,
|
||||
img_ids: Tensor,
|
||||
txt_list,
|
||||
txt_ids_list,
|
||||
timesteps: Tensor,
|
||||
y_list,
|
||||
img_len = 0,
|
||||
guidance: Tensor | None = None,
|
||||
callback= None,
|
||||
pipeline =None,
|
||||
siglip_embedding = None,
|
||||
siglip_embedding_ids = None,
|
||||
) -> Tensor:
|
||||
|
||||
sz = len(txt_list)
|
||||
# running on sequences img
|
||||
img = self.img_in(img)
|
||||
img_list = [img] if sz==1 else [img, img.clone()]
|
||||
|
||||
if self.chroma:
|
||||
mod_index_length = 344
|
||||
distill_timestep = timestep_embedding(timesteps, 16).to(img.device, img.dtype)
|
||||
guidance = torch.tensor([0.]* distill_timestep.shape[0])
|
||||
distil_guidance = timestep_embedding(guidance, 16).to(img.device, img.dtype)
|
||||
modulation_index = timestep_embedding(torch.arange(mod_index_length, device=img.device), 32).to(img.device, img.dtype)
|
||||
modulation_index = modulation_index.unsqueeze(0).repeat(img.shape[0], 1, 1).to(img.device, img.dtype)
|
||||
timestep_guidance = torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1).to(img.dtype).to(img.device, img.dtype)
|
||||
input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1).to(img.device, img.dtype)
|
||||
mod_vectors = self.distilled_guidance_layer(input_vec)
|
||||
else:
|
||||
vec = self.time_in(timestep_embedding(timesteps, 256))
|
||||
if self.params.guidance_embed:
|
||||
if guidance is None:
|
||||
raise ValueError("Didn't get guidance strength for guidance distilled model.")
|
||||
vec += self.guidance_in(timestep_embedding(guidance, 256))
|
||||
vec_list = [ vec + self.vector_in(y) for y in y_list]
|
||||
|
||||
img = None
|
||||
txt_list = [self.txt_in(txt) for txt in txt_list ]
|
||||
if siglip_embedding is not None:
|
||||
txt_list = [torch.cat((siglip_embedding, txt) , dim=1) for txt in txt_list]
|
||||
txt_ids_list = [torch.cat((siglip_embedding_ids, txt_id) , dim=1) for txt_id in txt_ids_list]
|
||||
|
||||
pe_list = [self.pe_embedder(torch.cat((txt_ids, img_ids), dim=1)) for txt_ids in txt_ids_list]
|
||||
|
||||
for i, block in enumerate(self.double_blocks):
|
||||
if self.chroma: vec_list = [( self.get_modulations(mod_vectors, "double_img", idx=i), self.get_modulations(mod_vectors, "double_txt", idx=i))] * sz
|
||||
if callback != None:
|
||||
callback(-1, None, False, True)
|
||||
if pipeline._interrupt:
|
||||
return [None] * sz
|
||||
for img, txt, pe, vec in zip(img_list, txt_list, pe_list, vec_list):
|
||||
img[...], txt[...] = block(img=img, txt=txt, vec=vec, pe=pe)
|
||||
img = txt = pe = vec= None
|
||||
|
||||
img_list = [torch.cat((txt, img), 1) for txt, img in zip(txt_list, img_list)]
|
||||
|
||||
for i, block in enumerate(self.single_blocks):
|
||||
if self.chroma: vec_list= [self.get_modulations(mod_vectors, "single", idx=i)] * sz
|
||||
if callback != None:
|
||||
callback(-1, None, False, True)
|
||||
if pipeline._interrupt:
|
||||
return [None] * sz
|
||||
for img, pe, vec in zip(img_list, pe_list, vec_list):
|
||||
img[...]= block(x=img, vec=vec, pe=pe)
|
||||
img = pe = vec = None
|
||||
img_list = [ img[:, txt.shape[1] : txt.shape[1] + img_len, ...] for img, txt in zip(img_list, txt_list)]
|
||||
|
||||
if self.chroma: vec_list = [self.get_modulations(mod_vectors, "final")] * sz
|
||||
out_list = []
|
||||
for i, (img, vec) in enumerate(zip(img_list, vec_list)):
|
||||
out_list.append( self.final_layer(img, vec)) # (N, T, patch_size ** 2 * out_channels)
|
||||
img_list[i] = img = vec = None
|
||||
return out_list
|
||||
|
||||
|
||||
class FluxLoraWrapper(Flux):
|
||||
def __init__(
|
||||
self,
|
||||
lora_rank: int = 128,
|
||||
lora_scale: float = 1.0,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
self.lora_rank = lora_rank
|
||||
|
||||
replace_linear_with_lora(
|
||||
self,
|
||||
max_rank=lora_rank,
|
||||
scale=lora_scale,
|
||||
)
|
||||
|
||||
def set_lora_scale(self, scale: float) -> None:
|
||||
for module in self.modules():
|
||||
if isinstance(module, LinearLora):
|
||||
module.set_scale(scale=scale)
|
||||
@@ -7,7 +7,7 @@ from safetensors.torch import load_file as load_sft
from torch import nn
from transformers import AutoModelForDepthEstimation, AutoProcessor, SiglipImageProcessor, SiglipVisionModel

-from flux.util import print_load_warning
+from ..util import print_load_warning


class DepthImageEncoder:


@@ -5,13 +5,14 @@ import torch
from einops import rearrange
from torch import Tensor, nn

-from flux.math import attention, rope
+from ..math import attention, rope

def get_linear_split_map():
    hidden_size = 3072
    split_linear_modules_map = {
        "qkv" : {"mapped_modules" : ["q", "k", "v"] , "split_sizes": [hidden_size, hidden_size, hidden_size]},
-       "linear1" : {"mapped_modules" : ["linear1_attn_q", "linear1_attn_k", "linear1_attn_v", "linear1_mlp"] , "split_sizes": [hidden_size, hidden_size, hidden_size, 7*hidden_size- 3*hidden_size]}
+       "linear1" : {"mapped_modules" : ["linear1_attn_q", "linear1_attn_k", "linear1_attn_v", "linear1_mlp"] , "split_sizes": [hidden_size, hidden_size, hidden_size, 7*hidden_size- 3*hidden_size]},
+       "linear1_qkv" : {"mapped_modules" : ["linear1_attn_q", "linear1_attn_k", "linear1_attn_v"] , "split_sizes": [hidden_size, hidden_size, hidden_size]},
    }
    return split_linear_modules_map
@@ -116,10 +117,20 @@ class ModulationOut:
    scale: Tensor
    gate: Tensor

+class ChromaModulationOut(ModulationOut):
+    @classmethod
+    def from_offset(cls, tensor: torch.Tensor, offset: int = 0):
+        return cls(
+            shift=tensor[:, offset : offset + 1, :],
+            scale=tensor[:, offset + 1 : offset + 2, :],
+            gate=tensor[:, offset + 2 : offset + 3, :],
+        )
+
-def split_mlp(mlp, x, divide = 4):
+def split_mlp(mlp, x, divide = 8):
    x_shape = x.shape
    x = x.view(-1, x.shape[-1])
-   chunk_size = int(x.shape[0]/divide)
+   chunk_size = int(x_shape[1]/divide)
    x_chunks = torch.split(x, chunk_size)
    for i, x_chunk in enumerate(x_chunks):
@ -145,13 +156,15 @@ class Modulation(nn.Module):
|
||||
|
||||
|
||||
class DoubleStreamBlock(nn.Module):
|
||||
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False):
|
||||
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, chroma_modulation = False):
|
||||
super().__init__()
|
||||
|
||||
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
||||
self.num_heads = num_heads
|
||||
self.hidden_size = hidden_size
|
||||
self.img_mod = Modulation(hidden_size, double=True)
|
||||
self.chroma_modulation = chroma_modulation
|
||||
if not chroma_modulation:
|
||||
self.img_mod = Modulation(hidden_size, double=True)
|
||||
self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
|
||||
|
||||
@ -162,7 +175,8 @@ class DoubleStreamBlock(nn.Module):
|
||||
nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
|
||||
)
|
||||
|
||||
self.txt_mod = Modulation(hidden_size, double=True)
|
||||
if not chroma_modulation:
|
||||
self.txt_mod = Modulation(hidden_size, double=True)
|
||||
self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
|
||||
|
||||
@ -174,8 +188,11 @@ class DoubleStreamBlock(nn.Module):
|
||||
)
|
||||
|
||||
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]:
|
||||
img_mod1, img_mod2 = self.img_mod(vec)
|
||||
txt_mod1, txt_mod2 = self.txt_mod(vec)
|
||||
if self.chroma_modulation:
|
||||
(img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
|
||||
else:
|
||||
img_mod1, img_mod2 = self.img_mod(vec)
|
||||
txt_mod1, txt_mod2 = self.txt_mod(vec)
|
||||
|
||||
# prepare image for attention
|
||||
img_modulated = self.img_norm1(img)
|
||||
@ -249,10 +266,12 @@ class SingleStreamBlock(nn.Module):
|
||||
num_heads: int,
|
||||
mlp_ratio: float = 4.0,
|
||||
qk_scale: float | None = None,
|
||||
chroma_modulation = False,
|
||||
):
|
||||
super().__init__()
|
||||
self.hidden_dim = hidden_size
|
||||
self.num_heads = num_heads
|
||||
self.chroma_modulation = chroma_modulation
|
||||
head_dim = hidden_size // num_heads
|
||||
self.scale = qk_scale or head_dim**-0.5
|
||||
|
||||
@ -268,10 +287,14 @@ class SingleStreamBlock(nn.Module):
|
||||
self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
|
||||
self.mlp_act = nn.GELU(approximate="tanh")
|
||||
self.modulation = Modulation(hidden_size, double=False)
|
||||
if not chroma_modulation:
|
||||
self.modulation = Modulation(hidden_size, double=False)
|
||||
|
||||
def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
|
||||
mod, _ = self.modulation(vec)
|
||||
if self.chroma_modulation:
|
||||
mod = vec
|
||||
else:
|
||||
mod, _ = self.modulation(vec)
|
||||
x_mod = self.pre_norm(x)
|
||||
x_mod.mul_(1 + mod.scale)
|
||||
x_mod.add_(mod.shift)
|
||||
@ -315,14 +338,172 @@ class SingleStreamBlock(nn.Module):
|
||||
|
||||
|
||||
class LastLayer(nn.Module):
|
||||
def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
|
||||
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, chroma_modulation = False):
|
||||
super().__init__()
|
||||
self.chroma_modulation = chroma_modulation
|
||||
self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
||||
self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
|
||||
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
|
||||
if not chroma_modulation:
|
||||
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
|
||||
|
||||
def forward(self, x: Tensor, vec: Tensor) -> Tensor:
|
||||
shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
|
||||
x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
|
||||
if self.chroma_modulation:
|
||||
shift, scale = vec
|
||||
shift = shift.squeeze(1)
|
||||
scale = scale.squeeze(1)
|
||||
else:
|
||||
shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
|
||||
# x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
|
||||
x = torch.addcmul(shift[:, None, :], 1 + scale[:, None, :], self.norm_final(x))
|
||||
x = self.linear(x)
|
||||
return x
|
||||
|
||||
|
||||
class DistilledGuidance(nn.Module):
|
||||
def __init__(self, in_dim: int, out_dim: int, hidden_dim: int, n_layers = 5):
|
||||
super().__init__()
|
||||
self.in_proj = nn.Linear(in_dim, hidden_dim, bias=True)
|
||||
self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim) for x in range( n_layers)])
|
||||
self.norms = nn.ModuleList([RMSNorm(hidden_dim) for x in range( n_layers)])
|
||||
self.out_proj = nn.Linear(hidden_dim, out_dim)
|
||||
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
x = self.in_proj(x)
|
||||
|
||||
for layer, norms in zip(self.layers, self.norms):
|
||||
x = x + layer(norms(x))
|
||||
|
||||
x = self.out_proj(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class SigLIPMultiFeatProjModel(torch.nn.Module):
|
||||
"""
|
||||
SigLIP Multi-Feature Projection Model for processing style features from different layers
|
||||
and projecting them into a unified hidden space.
|
||||
|
||||
Args:
|
||||
siglip_token_nums (int): Number of SigLIP tokens, default 257
|
||||
style_token_nums (int): Number of style tokens, default 256
|
||||
siglip_token_dims (int): Dimension of SigLIP tokens, default 1536
|
||||
hidden_size (int): Hidden layer size, default 3072
|
||||
context_layer_norm (bool): Whether to use context layer normalization, default False
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
siglip_token_nums: int = 257,
|
||||
style_token_nums: int = 256,
|
||||
siglip_token_dims: int = 1536,
|
||||
hidden_size: int = 3072,
|
||||
context_layer_norm: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
# High-level feature processing (layer -2)
|
||||
self.high_embedding_linear = nn.Sequential(
|
||||
nn.Linear(siglip_token_nums, style_token_nums),
|
||||
nn.SiLU()
|
||||
)
|
||||
self.high_layer_norm = (
|
||||
nn.LayerNorm(siglip_token_dims) if context_layer_norm else nn.Identity()
|
||||
)
|
||||
self.high_projection = nn.Linear(siglip_token_dims, hidden_size, bias=True)
|
||||
|
||||
# Mid-level feature processing (layer -11)
|
||||
self.mid_embedding_linear = nn.Sequential(
|
||||
nn.Linear(siglip_token_nums, style_token_nums),
|
||||
nn.SiLU()
|
||||
)
|
||||
self.mid_layer_norm = (
|
||||
nn.LayerNorm(siglip_token_dims) if context_layer_norm else nn.Identity()
|
||||
)
|
||||
self.mid_projection = nn.Linear(siglip_token_dims, hidden_size, bias=True)
|
||||
|
||||
# Low-level feature processing (layer -20)
|
||||
self.low_embedding_linear = nn.Sequential(
|
||||
nn.Linear(siglip_token_nums, style_token_nums),
|
||||
nn.SiLU()
|
||||
)
|
||||
self.low_layer_norm = (
|
||||
nn.LayerNorm(siglip_token_dims) if context_layer_norm else nn.Identity()
|
||||
)
|
||||
self.low_projection = nn.Linear(siglip_token_dims, hidden_size, bias=True)
|
||||
|
||||
def forward(self, siglip_outputs):
|
||||
"""
|
||||
Forward pass function
|
||||
|
||||
Args:
|
||||
siglip_outputs: Output from SigLIP model, containing hidden_states
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Concatenated multi-layer features with shape [bs, 3*style_token_nums, hidden_size]
|
||||
"""
|
||||
dtype = next(self.high_embedding_linear.parameters()).dtype
|
||||
|
||||
# Process high-level features (layer -2)
|
||||
high_embedding = self._process_layer_features(
|
||||
siglip_outputs.hidden_states[-2],
|
||||
self.high_embedding_linear,
|
||||
self.high_layer_norm,
|
||||
self.high_projection,
|
||||
dtype
|
||||
)
|
||||
|
||||
# Process mid-level features (layer -11)
|
||||
mid_embedding = self._process_layer_features(
|
||||
siglip_outputs.hidden_states[-11],
|
||||
self.mid_embedding_linear,
|
||||
self.mid_layer_norm,
|
||||
self.mid_projection,
|
||||
dtype
|
||||
)
|
||||
|
||||
# Process low-level features (layer -20)
|
||||
low_embedding = self._process_layer_features(
|
||||
siglip_outputs.hidden_states[-20],
|
||||
self.low_embedding_linear,
|
||||
self.low_layer_norm,
|
||||
self.low_projection,
|
||||
dtype
|
||||
)
|
||||
|
||||
# Concatenate features from all layers
|
||||
return torch.cat((high_embedding, mid_embedding, low_embedding), dim=1)
|
||||
|
||||
def _process_layer_features(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
embedding_linear: nn.Module,
|
||||
layer_norm: nn.Module,
|
||||
projection: nn.Module,
|
||||
dtype: torch.dtype
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Helper function to process features from a single layer
|
||||
|
||||
Args:
|
||||
hidden_states: Input hidden states [bs, seq_len, dim]
|
||||
embedding_linear: Embedding linear layer
|
||||
layer_norm: Layer normalization
|
||||
projection: Projection layer
|
||||
dtype: Target data type
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Processed features [bs, style_token_nums, hidden_size]
|
||||
"""
|
||||
# Transform dimensions: [bs, seq_len, dim] -> [bs, dim, seq_len] -> [bs, dim, style_token_nums] -> [bs, style_token_nums, dim]
|
||||
embedding = embedding_linear(
|
||||
hidden_states.to(dtype).transpose(1, 2)
|
||||
).transpose(1, 2)
|
||||
|
||||
# Apply layer normalization
|
||||
embedding = layer_norm(embedding)
|
||||
|
||||
# Project to target hidden space
|
||||
embedding = projection(embedding)
|
||||
|
||||
return embedding
|
||||
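To make the shapes concrete (editor's sketch using the class defaults above; not part of the diff): each selected SigLIP hidden state is squeezed from 257 to 256 tokens along the sequence axis, projected from 1536 to 3072 channels, and the three levels are concatenated:

import torch
from torch import nn

bs, siglip_tokens, siglip_dim, style_tokens, hidden = 2, 257, 1536, 256, 3072
h = torch.randn(bs, siglip_tokens, siglip_dim)                                     # one hidden_states entry
emb = nn.Linear(siglip_tokens, style_tokens)(h.transpose(1, 2)).transpose(1, 2)    # [bs, 256, 1536]
emb = nn.Linear(siglip_dim, hidden)(emb)                                           # [bs, 256, 3072]
out = torch.cat([emb, emb, emb], dim=1)                                            # high / mid / low levels
assert out.shape == (bs, 3 * style_tokens, hidden)                                 # [bs, 768, 3072], as documented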
models/flux/sampling.py (new file, 429 lines)
@@ -0,0 +1,429 @@
import math
from typing import Callable

import numpy as np
import torch
from einops import rearrange, repeat
from PIL import Image
from torch import Tensor

from .model import Flux
from .modules.autoencoder import AutoEncoder
from .modules.conditioner import HFEmbedder
from .modules.image_embedders import CannyImageEncoder, DepthImageEncoder, ReduxImageEncoder
from .util import PREFERED_KONTEXT_RESOLUTIONS
from einops import rearrange, repeat
from typing import Literal
import torchvision.transforms.functional as TVF
def get_noise(
|
||||
num_samples: int,
|
||||
height: int,
|
||||
width: int,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
):
|
||||
return torch.randn(
|
||||
num_samples,
|
||||
16,
|
||||
# allow for packing
|
||||
2 * math.ceil(height / 16),
|
||||
2 * math.ceil(width / 16),
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
generator=torch.Generator(device=device).manual_seed(seed),
|
||||
)
|
||||
|
||||
|
||||
def prepare_prompt(t5: HFEmbedder, clip: HFEmbedder, bs: int, prompt: str | list[str], neg: bool = False, device: str = "cuda") -> dict[str, Tensor]:
|
||||
if bs == 1 and not isinstance(prompt, str):
|
||||
bs = len(prompt)
|
||||
|
||||
if isinstance(prompt, str):
|
||||
prompt = [prompt]
|
||||
txt = t5(prompt)
|
||||
if txt.shape[0] == 1 and bs > 1:
|
||||
txt = repeat(txt, "1 ... -> bs ...", bs=bs)
|
||||
txt_ids = torch.zeros(bs, txt.shape[1], 3)
|
||||
|
||||
vec = clip(prompt)
|
||||
if vec.shape[0] == 1 and bs > 1:
|
||||
vec = repeat(vec, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
return {
|
||||
"neg_txt" if neg else "txt": txt.to(device),
|
||||
"neg_txt_ids" if neg else "txt_ids": txt_ids.to(device),
|
||||
"neg_vec" if neg else "vec": vec.to(device),
|
||||
}
|
||||
|
||||
|
||||
def prepare_img( img: Tensor) -> dict[str, Tensor]:
    bs, c, h, w = img.shape

    img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
    if img.shape[0] == 1 and bs > 1:
        img = repeat(img, "1 ... -> bs ...", bs=bs)

    img_ids = torch.zeros(h // 2, w // 2, 3)
    img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
    img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
    img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)

    return {
        "img": img,
        "img_ids": img_ids.to(img.device),
    }
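A small sketch of the packing performed by prepare_img (editor's addition; the 1024x1024 example size is only illustrative): the VAE latent is split into 2x2 patches and each token gets a 3-component position id.

import torch
from einops import rearrange, repeat

bs, c, h, w = 1, 16, 128, 128                       # e.g. a 1024x1024 image after the /8 VAE
img = torch.randn(bs, c, h, w)
img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
assert img.shape == (bs, (h // 2) * (w // 2), c * 4)   # 4096 tokens of 64 channels

img_ids = torch.zeros(h // 2, w // 2, 3)            # component 0 stays 0 for the target image
img_ids[..., 1] = torch.arange(h // 2)[:, None]     # row index
img_ids[..., 2] = torch.arange(w // 2)[None, :]     # column index
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
assert img_ids.shape == (bs, 4096, 3)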
|
||||
|
||||
|
||||
|
||||
|
||||
def prepare_redux(
|
||||
t5: HFEmbedder,
|
||||
clip: HFEmbedder,
|
||||
img: Tensor,
|
||||
prompt: str | list[str],
|
||||
encoder: ReduxImageEncoder,
|
||||
img_cond_path: str,
|
||||
) -> dict[str, Tensor]:
|
||||
bs, _, h, w = img.shape
|
||||
if bs == 1 and not isinstance(prompt, str):
|
||||
bs = len(prompt)
|
||||
|
||||
img_cond = Image.open(img_cond_path).convert("RGB")
|
||||
with torch.no_grad():
|
||||
img_cond = encoder(img_cond)
|
||||
|
||||
img_cond = img_cond.to(torch.bfloat16)
|
||||
if img_cond.shape[0] == 1 and bs > 1:
|
||||
img_cond = repeat(img_cond, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
|
||||
if img.shape[0] == 1 and bs > 1:
|
||||
img = repeat(img, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
img_ids = torch.zeros(h // 2, w // 2, 3)
|
||||
img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
|
||||
img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
|
||||
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
|
||||
|
||||
if isinstance(prompt, str):
|
||||
prompt = [prompt]
|
||||
txt = t5(prompt)
|
||||
txt = torch.cat((txt, img_cond.to(txt)), dim=-2)
|
||||
if txt.shape[0] == 1 and bs > 1:
|
||||
txt = repeat(txt, "1 ... -> bs ...", bs=bs)
|
||||
txt_ids = torch.zeros(bs, txt.shape[1], 3)
|
||||
|
||||
vec = clip(prompt)
|
||||
if vec.shape[0] == 1 and bs > 1:
|
||||
vec = repeat(vec, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
return {
|
||||
"img": img,
|
||||
"img_ids": img_ids.to(img.device),
|
||||
"txt": txt.to(img.device),
|
||||
"txt_ids": txt_ids.to(img.device),
|
||||
"vec": vec.to(img.device),
|
||||
}
|
||||
|
||||
|
||||
def prepare_kontext(
|
||||
ae: AutoEncoder,
|
||||
img_cond_list: list,
|
||||
seed: int,
|
||||
device: torch.device,
|
||||
target_width: int | None = None,
|
||||
target_height: int | None = None,
|
||||
bs: int = 1,
|
||||
|
||||
) -> tuple[dict[str, Tensor], int, int]:
|
||||
# load and encode the conditioning image
|
||||
|
||||
img_cond_seq = None
|
||||
img_cond_seq_ids = None
|
||||
if img_cond_list == None: img_cond_list = []
|
||||
height_offset = 0
|
||||
width_offset = 0
|
||||
for cond_no, img_cond in enumerate(img_cond_list):
|
||||
width, height = img_cond.size
|
||||
aspect_ratio = width / height
|
||||
|
||||
# Kontext is trained on specific resolutions, using one of them is recommended
|
||||
_, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS)
|
||||
|
||||
width = 2 * int(width / 16)
|
||||
height = 2 * int(height / 16)
|
||||
|
||||
img_cond = img_cond.resize((8 * width, 8 * height), Image.Resampling.LANCZOS)
|
||||
img_cond = np.array(img_cond)
|
||||
img_cond = torch.from_numpy(img_cond).float() / 127.5 - 1.0
|
||||
img_cond = rearrange(img_cond, "h w c -> 1 c h w")
|
||||
with torch.no_grad():
|
||||
img_cond_latents = ae.encode(img_cond.to(device))
|
||||
|
||||
img_cond_latents = img_cond_latents.to(torch.bfloat16)
|
||||
img_cond_latents = rearrange(img_cond_latents, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
|
||||
if img_cond.shape[0] == 1 and bs > 1:
|
||||
img_cond_latents = repeat(img_cond_latents, "1 ... -> bs ...", bs=bs)
|
||||
img_cond = None
|
||||
|
||||
# image ids are the same as base image with the first dimension set to 1
|
||||
# instead of 0
|
||||
img_cond_ids = torch.zeros(height // 2, width // 2, 3)
|
||||
img_cond_ids[..., 0] = 1
|
||||
img_cond_ids[..., 1] = img_cond_ids[..., 1] + torch.arange(height // 2)[:, None] + height_offset
|
||||
img_cond_ids[..., 2] = img_cond_ids[..., 2] + torch.arange(width // 2)[None, :] + width_offset
|
||||
img_cond_ids = repeat(img_cond_ids, "h w c -> b (h w) c", b=bs)
|
||||
height_offset += height // 2
|
||||
width_offset += width // 2
|
||||
|
||||
if target_width is None:
|
||||
target_width = 8 * width
|
||||
if target_height is None:
|
||||
target_height = 8 * height
|
||||
img_cond_ids = img_cond_ids.to(device)
|
||||
if cond_no == 0:
|
||||
img_cond_seq, img_cond_seq_ids = img_cond_latents, img_cond_ids
|
||||
else:
|
||||
img_cond_seq, img_cond_seq_ids = torch.cat([img_cond_seq, img_cond_latents], dim=1), torch.cat([img_cond_seq_ids, img_cond_ids], dim=1)
|
||||
|
||||
return_dict = {
|
||||
"img_cond_seq": img_cond_seq,
|
||||
"img_cond_seq_ids": img_cond_seq_ids,
|
||||
}
|
||||
img = get_noise(
|
||||
bs,
|
||||
target_height,
|
||||
target_width,
|
||||
device=device,
|
||||
dtype=torch.bfloat16,
|
||||
seed=seed,
|
||||
)
|
||||
return_dict.update(prepare_img(img))
|
||||
|
||||
return return_dict, target_height, target_width
|
||||
|
||||
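A numeric example of the resolution handling in prepare_kontext above (editor's sketch; the chosen preferred resolution is hypothetical):

width, height = 1216, 832                                      # assume this is the closest-aspect Kontext resolution, in pixels
width, height = 2 * int(width / 16), 2 * int(height / 16)      # latent-space dims: 152 x 104
# the conditioning image is resized to (8 * width, 8 * height) = (1216, 832) pixels,
# the VAE divides by 8 again, and the 2x2 packing leaves (width // 2) * (height // 2) tokens
tokens = (width // 2) * (height // 2)                          # 76 * 52 = 3952 sequence positions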
|
||||
def time_shift(mu: float, sigma: float, t: Tensor):
    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)


def get_lin_function(
    x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
) -> Callable[[float], float]:
    m = (y2 - y1) / (x2 - x1)
    b = y1 - m * x1
    return lambda x: m * x + b


def get_schedule(
    num_steps: int,
    image_seq_len: int,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
    shift: bool = True,
) -> list[float]:
    # extra step for zero
    timesteps = torch.linspace(1, 0, num_steps + 1)

    # shifting the schedule to favor high timesteps for higher signal images
    if shift:
        # estimate mu based on linear estimation between two points
        mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
        timesteps = time_shift(mu, 1.0, timesteps)

    return timesteps.tolist()
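A worked example of the shift (editor's sketch, using the default anchors x1=256, y1=0.5 and x2=4096, y2=1.15): for a 4096-token image the estimated mu is 1.15, which pulls the mid-schedule timestep 0.5 up to roughly 0.76, so more of the steps are spent at high noise levels.

import math

def time_shift(mu, sigma, t):
    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

image_seq_len = 4096                        # e.g. a 1024x1024 latent packed into 2x2 patches
m = (1.15 - 0.5) / (4096 - 256)             # slope of get_lin_function with its defaults
mu = m * image_seq_len + (0.5 - m * 256)    # = 1.15 at the upper anchor
print(round(time_shift(mu, 1.0, 0.5), 3))   # ~0.759 instead of 0.5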
|
||||
|
||||
def denoise(
|
||||
model: Flux,
|
||||
# model input
|
||||
img: Tensor,
|
||||
img_ids: Tensor,
|
||||
txt: Tensor,
|
||||
txt_ids: Tensor,
|
||||
vec: Tensor,
|
||||
# sampling parameters
|
||||
timesteps: list[float],
|
||||
guidance: float = 4.0,
|
||||
real_guidance_scale = None,
|
||||
# extra img tokens (channel-wise)
|
||||
neg_txt: Tensor = None,
|
||||
neg_txt_ids: Tensor= None,
|
||||
neg_vec: Tensor = None,
|
||||
img_cond: Tensor | None = None,
|
||||
# extra img tokens (sequence-wise)
|
||||
img_cond_seq: Tensor | None = None,
|
||||
img_cond_seq_ids: Tensor | None = None,
|
||||
siglip_embedding = None,
|
||||
siglip_embedding_ids = None,
|
||||
callback=None,
|
||||
pipeline=None,
|
||||
loras_slists=None,
|
||||
unpack_latent = None,
|
||||
joint_pass= False,
|
||||
):
|
||||
|
||||
kwargs = {'pipeline': pipeline, 'callback': callback, "img_len" : img.shape[1], "siglip_embedding": siglip_embedding, "siglip_embedding_ids": siglip_embedding_ids}
|
||||
|
||||
if callback != None:
|
||||
callback(-1, None, True)
|
||||
|
||||
updated_num_steps= len(timesteps) -1
|
||||
if callback != None:
|
||||
from shared.utils.loras_mutipliers import update_loras_slists
|
||||
update_loras_slists(model, loras_slists, updated_num_steps)
|
||||
callback(-1, None, True, override_num_inference_steps = updated_num_steps)
|
||||
from mmgp import offload
|
||||
# this is ignored for schnell
|
||||
guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
|
||||
for i, (t_curr, t_prev) in enumerate(zip(timesteps[:-1], timesteps[1:])):
|
||||
offload.set_step_no_for_lora(model, i)
|
||||
if pipeline._interrupt:
|
||||
return None
|
||||
|
||||
t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
|
||||
img_input = img
|
||||
img_input_ids = img_ids
|
||||
if img_cond is not None:
|
||||
img_input = torch.cat((img, img_cond), dim=-1)
|
||||
if img_cond_seq is not None:
|
||||
img_input = torch.cat((img_input, img_cond_seq), dim=1)
|
||||
img_input_ids = torch.cat((img_input_ids, img_cond_seq_ids), dim=1)
|
||||
if not joint_pass or real_guidance_scale == 1:
|
||||
pred = model(
|
||||
img=img_input,
|
||||
img_ids=img_input_ids,
|
||||
txt_list=[txt],
|
||||
txt_ids_list=[txt_ids],
|
||||
y_list=[vec],
|
||||
timesteps=t_vec,
|
||||
guidance=guidance_vec,
|
||||
**kwargs
|
||||
)[0]
|
||||
if pred == None: return None
|
||||
if real_guidance_scale> 1:
|
||||
neg_pred = model(
|
||||
img=img_input,
|
||||
img_ids=img_input_ids,
|
||||
txt_list=[neg_txt],
|
||||
txt_ids_list=[neg_txt_ids],
|
||||
y_list=[neg_vec],
|
||||
timesteps=t_vec,
|
||||
guidance=guidance_vec,
|
||||
**kwargs
|
||||
)[0]
|
||||
if neg_pred == None: return None
|
||||
else:
|
||||
pred, neg_pred = model(
|
||||
img=img_input,
|
||||
img_ids=img_input_ids,
|
||||
txt_list=[txt, neg_txt],
|
||||
txt_ids_list=[txt_ids, neg_txt_ids],
|
||||
y_list=[vec, neg_vec],
|
||||
timesteps=t_vec,
|
||||
guidance=guidance_vec,
|
||||
**kwargs
|
||||
)
|
||||
if pred == None: return None
|
||||
|
||||
if real_guidance_scale > 1:
|
||||
pred = neg_pred + real_guidance_scale * (pred - neg_pred)
|
||||
|
||||
img += (t_prev - t_curr) * pred
|
||||
if callback is not None:
|
||||
preview = unpack_latent(img).transpose(0,1)
|
||||
callback(i, preview, False)
|
||||
|
||||
|
||||
return img
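The per-step update above combines classifier-free guidance with a rectified-flow Euler step; a minimal standalone sketch of that arithmetic (editor's addition):

def cfg_euler_step(img, pred, neg_pred, t_curr, t_prev, real_guidance_scale):
    # classifier-free guidance: move the prediction away from the negative-prompt prediction
    if real_guidance_scale > 1:
        pred = neg_pred + real_guidance_scale * (pred - neg_pred)
    # Euler step on the flow: timesteps run from 1 towards 0, so (t_prev - t_curr) is negative
    return img + (t_prev - t_curr) * pred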
|
||||
|
||||
def prepare_multi_ip(
|
||||
ae: AutoEncoder,
|
||||
img_cond_list: list,
|
||||
seed: int,
|
||||
device: torch.device,
|
||||
target_width: int | None = None,
|
||||
target_height: int | None = None,
|
||||
bs: int = 1,
|
||||
pe: Literal["d", "h", "w", "o"] = "d",
|
||||
) -> dict[str, Tensor]:
|
||||
ref_imgs = img_cond_list
|
||||
assert pe in ["d", "h", "w", "o"]
|
||||
|
||||
ref_imgs = [
|
||||
ae.encode(
|
||||
(TVF.to_tensor(ref_img) * 2.0 - 1.0)
|
||||
.unsqueeze(0)
|
||||
.to(device, torch.float32)
|
||||
).to(torch.bfloat16)
|
||||
for ref_img in img_cond_list
|
||||
]
|
||||
|
||||
img = get_noise( bs, target_height, target_width, device=device, dtype=torch.bfloat16, seed=seed)
|
||||
bs, c, h, w = img.shape
|
||||
# tgt img
|
||||
img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
|
||||
if img.shape[0] == 1 and bs > 1:
|
||||
img = repeat(img, "1 ... -> bs ...", bs=bs)
|
||||
|
||||
img_ids = torch.zeros(h // 2, w // 2, 3)
|
||||
img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
|
||||
img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
|
||||
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
|
||||
img_cond_seq = img_cond_seq_ids = None
|
||||
pe_shift_w, pe_shift_h = w // 2, h // 2
|
||||
for cond_no, ref_img in enumerate(ref_imgs):
|
||||
_, _, ref_h1, ref_w1 = ref_img.shape
|
||||
ref_img = rearrange(
|
||||
ref_img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2
|
||||
)
|
||||
if ref_img.shape[0] == 1 and bs > 1:
|
||||
ref_img = repeat(ref_img, "1 ... -> bs ...", bs=bs)
|
||||
ref_img_ids1 = torch.zeros(ref_h1 // 2, ref_w1 // 2, 3)
|
||||
# offset the reference image ids in height and width by the current maximum of each
|
||||
h_offset = pe_shift_h if pe in {"d", "h"} else 0
|
||||
w_offset = pe_shift_w if pe in {"d", "w"} else 0
|
||||
ref_img_ids1[..., 1] = (
|
||||
ref_img_ids1[..., 1] + torch.arange(ref_h1 // 2)[:, None] + h_offset
|
||||
)
|
||||
ref_img_ids1[..., 2] = (
|
||||
ref_img_ids1[..., 2] + torch.arange(ref_w1 // 2)[None, :] + w_offset
|
||||
)
|
||||
ref_img_ids1 = repeat(ref_img_ids1, "h w c -> b (h w) c", b=bs)
|
||||
|
||||
if target_width is None:
|
||||
target_width = 8 * ref_w1
|
||||
if target_height is None:
|
||||
target_height = 8 * ref_h1
|
||||
ref_img_ids1 = ref_img_ids1.to(device)
|
||||
if cond_no == 0:
|
||||
img_cond_seq, img_cond_seq_ids = ref_img, ref_img_ids1
|
||||
else:
|
||||
img_cond_seq, img_cond_seq_ids = torch.cat([img_cond_seq, ref_img], dim=1), torch.cat([img_cond_seq_ids, ref_img_ids1], dim=1)
|
||||
|
||||
|
||||
# update the pe shift
|
||||
pe_shift_h += ref_h1 // 2
|
||||
pe_shift_w += ref_w1 // 2
|
||||
|
||||
return {
|
||||
"img": img,
|
||||
"img_ids": img_ids.to(img.device),
|
||||
"img_cond_seq": img_cond_seq,
|
||||
"img_cond_seq_ids": img_cond_seq_ids,
|
||||
}, target_height, target_width
|
||||
|
||||
|
||||
def unpack(x: Tensor, height: int, width: int) -> Tensor:
|
||||
return rearrange(
|
||||
x,
|
||||
"b (h w) (c ph pw) -> b c (h ph) (w pw)",
|
||||
h=math.ceil(height / 16),
|
||||
w=math.ceil(width / 16),
|
||||
ph=2,
|
||||
pw=2,
|
||||
)
|
||||
@ -11,16 +11,13 @@ from huggingface_hub import hf_hub_download, login
|
||||
from PIL import ExifTags, Image
|
||||
from safetensors.torch import load_file as load_sft
|
||||
|
||||
from flux.model import Flux, FluxLoraWrapper, FluxParams
|
||||
from flux.modules.autoencoder import AutoEncoder, AutoEncoderParams
|
||||
from flux.modules.conditioner import HFEmbedder
|
||||
from .model import Flux, FluxLoraWrapper, FluxParams
|
||||
from .modules.autoencoder import AutoEncoder, AutoEncoderParams
|
||||
from .modules.conditioner import HFEmbedder
|
||||
|
||||
CHECKPOINTS_DIR = Path("checkpoints")
|
||||
CHECKPOINTS_DIR.mkdir(exist_ok=True)
|
||||
BFL_API_KEY = os.getenv("BFL_API_KEY")
|
||||
|
||||
os.environ.setdefault("TRT_ENGINE_DIR", str(CHECKPOINTS_DIR / "trt_engines"))
|
||||
(CHECKPOINTS_DIR / "trt_engines").mkdir(exist_ok=True)
|
||||
BFL_API_KEY = os.getenv("BFL_API_KEY")
|
||||
|
||||
|
||||
def ensure_hf_auth():
|
||||
@ -358,6 +355,38 @@ configs = {
|
||||
shift_factor=0.1159,
|
||||
),
|
||||
),
|
||||
"flux-chroma": ModelSpec(
|
||||
repo_id="lodestones/Chroma1-HD",
|
||||
repo_flow="",
|
||||
repo_ae="ckpts/flux_vae.safetensors",
|
||||
params=FluxParams(
|
||||
in_channels=64,
|
||||
out_channels=64,
|
||||
vec_in_dim=768,
|
||||
context_in_dim=4096,
|
||||
hidden_size=3072,
|
||||
mlp_ratio=4.0,
|
||||
num_heads=24,
|
||||
depth=19,
|
||||
depth_single_blocks=38,
|
||||
axes_dim=[16, 56, 56],
|
||||
theta=10_000,
|
||||
qkv_bias=True,
|
||||
guidance_embed=False,
|
||||
chroma=True,
|
||||
),
|
||||
ae_params=AutoEncoderParams(
|
||||
resolution=256,
|
||||
in_channels=3,
|
||||
ch=128,
|
||||
out_ch=3,
|
||||
ch_mult=[1, 2, 4, 4],
|
||||
num_res_blocks=2,
|
||||
z_channels=16,
|
||||
scale_factor=0.3611,
|
||||
shift_factor=0.1159,
|
||||
),
|
||||
),
|
||||
"flux-dev-canny": ModelSpec(
|
||||
repo_id="black-forest-labs/FLUX.1-Canny-dev",
|
||||
repo_flow="",
|
||||
@ -579,6 +608,38 @@ configs = {
|
||||
shift_factor=0.1159,
|
||||
),
|
||||
),
|
||||
"flux-dev-uso": ModelSpec(
|
||||
repo_id="",
|
||||
repo_flow="",
|
||||
repo_ae="ckpts/flux_vae.safetensors",
|
||||
params=FluxParams(
|
||||
in_channels=64,
|
||||
out_channels=64,
|
||||
vec_in_dim=768,
|
||||
context_in_dim=4096,
|
||||
hidden_size=3072,
|
||||
mlp_ratio=4.0,
|
||||
num_heads=24,
|
||||
depth=19,
|
||||
depth_single_blocks=38,
|
||||
axes_dim=[16, 56, 56],
|
||||
theta=10_000,
|
||||
qkv_bias=True,
|
||||
guidance_embed=True,
|
||||
eso= True,
|
||||
),
|
||||
ae_params=AutoEncoderParams(
|
||||
resolution=256,
|
||||
in_channels=3,
|
||||
ch=128,
|
||||
out_ch=3,
|
||||
ch_mult=[1, 2, 4, 4],
|
||||
num_res_blocks=2,
|
||||
z_channels=16,
|
||||
scale_factor=0.3611,
|
||||
shift_factor=0.1159,
|
||||
),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
models/hyvideo/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
from .hunyuan import HunyuanVideoSampler
from . import hunyuan_handler

@@ -249,7 +249,7 @@ class DetFace():
        for scale in [8,16,32]:
            ny = h1//scale
            nx = w1//scale
-           yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
+           yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)], indexing="ij")
            grid = torch.stack((xv, yv), 2).view((1,1,ny, nx, 2)).float()
            grids.append(grid.to(self.test_device))
        self.grids = grids