From 6aed2bb402511a29f905155591dcb46decfaebc3 Mon Sep 17 00:00:00 2001
From: bmaltais
Date: Sun, 15 Jan 2023 11:05:22 -0500
Subject: [PATCH] Add support for new arguments:

- max_train_epochs
- max_data_loader_n_workers

Move some of the code to the common GUI library.
---
 LICENSE.md                               | 201 ++++
 README.md                                |   7 +
 dreambooth_gui.py                        |  25 +
 fine_tune.py                             | 688 ++++++++------
 finetune/make_captions.py                |   2 +-
 finetune_gui copy.py                     | 789 -----------------------
 finetune_gui.py                          |  11 +
 gen_img_diffusers.py                     | 182 +++---
 library/common_gui.py                    |  29 +-
 library/model_util.py                    |  13 +-
 library/train_util.py                    |  21 +-
 lora_gui.py                              |  14 +-
 networks/extract_lora_from_models.py     |   2 +-
 networks/lora.py                         |   9 +-
 tools/convert_diffusers20_original_sd.py |   2 +-
 tools/detect_face_rotate.py              |   2 +-
 train_db.py                              |   7 +-
 train_network.py                         |  70 +-
 18 files changed, 781 insertions(+), 1293 deletions(-)
 create mode 100644 LICENSE.md
 delete mode 100644 finetune_gui copy.py

diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..56765e7
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [2022] [kohya-ss]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
index 742bd9a..2e9238c 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,13 @@ Once you have created the LoRA network you can generate images via auto1111 by i
 ## Change history
+* 2023/01/15 (v20.2.1):
+    - Merged the latest code updates from kohya.
+    - Added the `--max_train_epochs` and `--max_data_loader_n_workers` options to each training script.
+    - If you specify the number of training epochs with `--max_train_epochs`, the number of training steps is calculated from the number of epochs automatically.
+    - You can set the number of DataLoader workers with `--max_data_loader_n_workers` (default: 8). A lower number may reduce main memory usage and the time between epochs, but may slow down data loading, and therefore training.
+    - Fixed an issue where loading some VAE models, or `.safetensors` files as VAE, failed with the `--vae` option. Thanks to Fannovel16!
+ - Add negative prompt scaling for `gen_img_diffusers.py` You can set another conditioning scale to the negative prompt with `--negative_scale` option, and `--nl` option for the prompt. Thanks to laksjdjf! * 2023/01/11 (v20.2.0): - Add support for max token lenght * 2023/01/10 (v20.1.1): diff --git a/dreambooth_gui.py b/dreambooth_gui.py index f5ea216..914bb8f 100644 --- a/dreambooth_gui.py +++ b/dreambooth_gui.py @@ -20,6 +20,8 @@ from library.common_gui import ( color_aug_changed, save_inference_file, set_pretrained_model_name_or_path_input, + gradio_advanced_training, + run_cmd_advanced_training, ) from library.dreambooth_folder_creation_gui import ( gradio_dreambooth_folder_creation_tab, @@ -74,6 +76,8 @@ def save_configuration( vae, output_name, max_token_length, + max_train_epochs, + max_data_loader_n_workers, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -153,6 +157,8 @@ def open_configuration( vae, output_name, max_token_length, + max_train_epochs, + max_data_loader_n_workers, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -216,6 +222,8 @@ def train_model( vae, output_name, max_token_length, + max_train_epochs, + max_data_loader_n_workers, ): if pretrained_model_name_or_path == '': msgbox('Source model information is missing') @@ -372,6 +380,11 @@ def train_model( run_cmd += f' --output_name="{output_name}"' if (int(max_token_length) > 75): run_cmd += f' --max_token_length={max_token_length}' + if not max_train_epochs == '': + run_cmd += f' --max_train_epochs="{max_train_epochs}"' + if not max_data_loader_n_workers == '': + run_cmd += f' --max_data_loader_n_workers="{max_data_loader_n_workers}"' + run_cmd += run_cmd_advanced_training(max_train_epochs=max_train_epochs, max_data_loader_n_workers=max_data_loader_n_workers) print(run_cmd) # Run the command @@ -708,6 +721,16 @@ def dreambooth_tab( ], value='75', ) + max_train_epochs, max_data_loader_n_workers = gradio_advanced_training() + # with gr.Row(): + # max_train_epochs = gr.Textbox( + # label='Max train epoch', + # placeholder='(Optional) Override number of epoch', + # ) + # max_data_loader_n_workers = gr.Textbox( + # label='Max num workers for DataLoader', + # placeholder='(Optional) Override number of epoch. Default: 8', + # ) with gr.Tab('Tools'): gr.Markdown( 'This section provide Dreambooth tools to help setup your dataset...' 
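The GUI hunks above and below call two new shared helpers, `gradio_advanced_training()` and `run_cmd_advanced_training()`, but the corresponding `library/common_gui.py` hunk only appears in the diffstat, not in this excerpt. The following is a minimal sketch of what those helpers could look like, inferred from the call sites and the commented-out widget block in the hunk above; the function bodies are assumptions, not the actual patch content.

```python
# Hypothetical sketch of the shared helpers added to library/common_gui.py.
# Only their call sites appear in this excerpt, so the bodies below are assumptions;
# the widget labels mirror the commented-out block in dreambooth_gui.py, and the
# flag-building logic mirrors the per-script checks the GUI scripts previously did.
import gradio as gr


def gradio_advanced_training():
    # Build the two new advanced-training inputs once so every GUI tab can reuse them.
    with gr.Row():
        max_train_epochs = gr.Textbox(
            label='Max train epoch',
            placeholder='(Optional) Override number of epochs',
        )
        max_data_loader_n_workers = gr.Textbox(
            label='Max num workers for DataLoader',
            placeholder='(Optional) Override number of workers. Default: 8',
        )
    return max_train_epochs, max_data_loader_n_workers


def run_cmd_advanced_training(**kwargs):
    # Translate the advanced-training fields into CLI flags; blank fields are
    # skipped so the training scripts fall back to their own defaults.
    run_cmd = ''
    max_train_epochs = kwargs.get('max_train_epochs', '')
    if max_train_epochs != '':
        run_cmd += f' --max_train_epochs="{max_train_epochs}"'
    max_data_loader_n_workers = kwargs.get('max_data_loader_n_workers', '')
    if max_data_loader_n_workers != '':
        run_cmd += f' --max_data_loader_n_workers="{max_data_loader_n_workers}"'
    return run_cmd
```

Returning the flags as a single string lets each `train_model()` append them with one `run_cmd += run_cmd_advanced_training(...)` call, which would allow the per-script `if not max_train_epochs == '':` checks (still present in the dreambooth hunk above) to be dropped.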
@@ -760,6 +783,8 @@ def dreambooth_tab( vae, output_name, max_token_length, + max_train_epochs, + max_data_loader_n_workers, ] button_open_config.click( diff --git a/fine_tune.py b/fine_tune.py index d0ebd64..02f665b 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -16,456 +16,326 @@ import library.train_util as train_util def collate_fn(examples): - return examples[0] + return examples[0] def train(args): - train_util.verify_training_args(args) - train_util.prepare_dataset_args(args, True) + train_util.verify_training_args(args) + train_util.prepare_dataset_args(args, True) - cache_latents = args.cache_latents + cache_latents = args.cache_latents - if args.seed is not None: - set_seed(args.seed) # 乱数系列を初期化する + if args.seed is not None: + set_seed(args.seed) # 乱数系列を初期化する - tokenizer = train_util.load_tokenizer(args) + tokenizer = train_util.load_tokenizer(args) - train_dataset = train_util.FineTuningDataset( - args.in_json, - args.train_batch_size, - args.train_data_dir, - tokenizer, - args.max_token_length, - args.shuffle_caption, - args.keep_tokens, - args.resolution, - args.enable_bucket, - args.min_bucket_reso, - args.max_bucket_reso, - args.flip_aug, - args.color_aug, - args.face_crop_aug_range, - args.random_crop, - args.dataset_repeats, - args.debug_dataset, - ) - train_dataset.make_buckets() + train_dataset = train_util.FineTuningDataset(args.in_json, args.train_batch_size, args.train_data_dir, + tokenizer, args.max_token_length, args.shuffle_caption, args.keep_tokens, + args.resolution, args.enable_bucket, args.min_bucket_reso, args.max_bucket_reso, + args.flip_aug, args.color_aug, args.face_crop_aug_range, args.random_crop, + args.dataset_repeats, args.debug_dataset) + train_dataset.make_buckets() - if args.debug_dataset: - train_util.debug_dataset(train_dataset) - return - if len(train_dataset) == 0: - print( - 'No data found. Please verify the metadata file and train_data_dir option. / 画像がありません。メタデータおよびtrain_data_dirオプションを確認してください。' - ) - return + if args.debug_dataset: + train_util.debug_dataset(train_dataset) + return + if len(train_dataset) == 0: + print("No data found. Please verify the metadata file and train_data_dir option. 
/ 画像がありません。メタデータおよびtrain_data_dirオプションを確認してください。") + return - # acceleratorを準備する - print('prepare accelerator') - accelerator, unwrap_model = train_util.prepare_accelerator(args) + # acceleratorを準備する + print("prepare accelerator") + accelerator, unwrap_model = train_util.prepare_accelerator(args) - # mixed precisionに対応した型を用意しておき適宜castする - weight_dtype, save_dtype = train_util.prepare_dtype(args) + # mixed precisionに対応した型を用意しておき適宜castする + weight_dtype, save_dtype = train_util.prepare_dtype(args) - # モデルを読み込む - ( - text_encoder, - vae, - unet, - load_stable_diffusion_format, - ) = train_util.load_target_model(args, weight_dtype) + # モデルを読み込む + text_encoder, vae, unet, load_stable_diffusion_format = train_util.load_target_model(args, weight_dtype) - # verify load/save model formats - if load_stable_diffusion_format: - src_stable_diffusion_ckpt = args.pretrained_model_name_or_path - src_diffusers_model_path = None - else: - src_stable_diffusion_ckpt = None - src_diffusers_model_path = args.pretrained_model_name_or_path + # verify load/save model formats + if load_stable_diffusion_format: + src_stable_diffusion_ckpt = args.pretrained_model_name_or_path + src_diffusers_model_path = None + else: + src_stable_diffusion_ckpt = None + src_diffusers_model_path = args.pretrained_model_name_or_path - if args.save_model_as is None: - save_stable_diffusion_format = load_stable_diffusion_format - use_safetensors = args.use_safetensors - else: - save_stable_diffusion_format = ( - args.save_model_as.lower() == 'ckpt' - or args.save_model_as.lower() == 'safetensors' - ) - use_safetensors = args.use_safetensors or ( - 'safetensors' in args.save_model_as.lower() - ) + if args.save_model_as is None: + save_stable_diffusion_format = load_stable_diffusion_format + use_safetensors = args.use_safetensors + else: + save_stable_diffusion_format = args.save_model_as.lower() == 'ckpt' or args.save_model_as.lower() == 'safetensors' + use_safetensors = args.use_safetensors or ("safetensors" in args.save_model_as.lower()) - # Diffusers版のxformers使用フラグを設定する関数 - def set_diffusers_xformers_flag(model, valid): - # model.set_use_memory_efficient_attention_xformers(valid) # 次のリリースでなくなりそう - # pipeが自動で再帰的にset_use_memory_efficient_attention_xformersを探すんだって(;´Д`) - # U-Netだけ使う時にはどうすればいいのか……仕方ないからコピって使うか - # 0.10.2でなんか巻き戻って個別に指定するようになった(;^ω^) + # Diffusers版のxformers使用フラグを設定する関数 + def set_diffusers_xformers_flag(model, valid): + # model.set_use_memory_efficient_attention_xformers(valid) # 次のリリースでなくなりそう + # pipeが自動で再帰的にset_use_memory_efficient_attention_xformersを探すんだって(;´Д`) + # U-Netだけ使う時にはどうすればいいのか……仕方ないからコピって使うか + # 0.10.2でなんか巻き戻って個別に指定するようになった(;^ω^) - # Recursively walk through all the children. - # Any children which exposes the set_use_memory_efficient_attention_xformers method - # gets the message - def fn_recursive_set_mem_eff(module: torch.nn.Module): - if hasattr(module, 'set_use_memory_efficient_attention_xformers'): - module.set_use_memory_efficient_attention_xformers(valid) + # Recursively walk through all the children. 
+ # Any children which exposes the set_use_memory_efficient_attention_xformers method + # gets the message + def fn_recursive_set_mem_eff(module: torch.nn.Module): + if hasattr(module, "set_use_memory_efficient_attention_xformers"): + module.set_use_memory_efficient_attention_xformers(valid) - for child in module.children(): - fn_recursive_set_mem_eff(child) + for child in module.children(): + fn_recursive_set_mem_eff(child) - fn_recursive_set_mem_eff(model) + fn_recursive_set_mem_eff(model) - # モデルに xformers とか memory efficient attention を組み込む - if args.diffusers_xformers: - print('Use xformers by Diffusers') - set_diffusers_xformers_flag(unet, True) - else: - # Windows版のxformersはfloatで学習できないのでxformersを使わない設定も可能にしておく必要がある - print("Disable Diffusers' xformers") - set_diffusers_xformers_flag(unet, False) - train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers) + # モデルに xformers とか memory efficient attention を組み込む + if args.diffusers_xformers: + print("Use xformers by Diffusers") + set_diffusers_xformers_flag(unet, True) + else: + # Windows版のxformersはfloatで学習できないのでxformersを使わない設定も可能にしておく必要がある + print("Disable Diffusers' xformers") + set_diffusers_xformers_flag(unet, False) + train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers) - # 学習を準備する - if cache_latents: - vae.to(accelerator.device, dtype=weight_dtype) - vae.requires_grad_(False) - vae.eval() - with torch.no_grad(): - train_dataset.cache_latents(vae) - vae.to('cpu') - if torch.cuda.is_available(): - torch.cuda.empty_cache() - gc.collect() + # 学習を準備する + if cache_latents: + vae.to(accelerator.device, dtype=weight_dtype) + vae.requires_grad_(False) + vae.eval() + with torch.no_grad(): + train_dataset.cache_latents(vae) + vae.to("cpu") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() - # 学習を準備する:モデルを適切な状態にする - training_models = [] + # 学習を準備する:モデルを適切な状態にする + training_models = [] + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + training_models.append(unet) + + if args.train_text_encoder: + print("enable text encoder training") if args.gradient_checkpointing: - unet.enable_gradient_checkpointing() - training_models.append(unet) - - if args.train_text_encoder: - print('enable text encoder training') - if args.gradient_checkpointing: - text_encoder.gradient_checkpointing_enable() - training_models.append(text_encoder) + text_encoder.gradient_checkpointing_enable() + training_models.append(text_encoder) + else: + text_encoder.to(accelerator.device, dtype=weight_dtype) + text_encoder.requires_grad_(False) # text encoderは学習しない + if args.gradient_checkpointing: + text_encoder.gradient_checkpointing_enable() + text_encoder.train() # required for gradient_checkpointing else: - text_encoder.to(accelerator.device, dtype=weight_dtype) - text_encoder.requires_grad_(False) # text encoderは学習しない - if args.gradient_checkpointing: - text_encoder.gradient_checkpointing_enable() - text_encoder.train() # required for gradient_checkpointing + text_encoder.eval() + + if not cache_latents: + vae.requires_grad_(False) + vae.eval() + vae.to(accelerator.device, dtype=weight_dtype) + + for m in training_models: + m.requires_grad_(True) + params = [] + for m in training_models: + params.extend(m.parameters()) + params_to_optimize = params + + # 学習に必要なクラスを準備する + print("prepare optimizer, data loader etc.") + + # 8-bit Adamを使う + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError("No bitsand bytes / bitsandbytesがインストールされていないようです") + 
print("use 8-bit Adam optimizer") + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # betaやweight decayはdiffusers DreamBoothもDreamBooth SDもデフォルト値のようなのでオプションはとりあえず省略 + optimizer = optimizer_class(params_to_optimize, lr=args.learning_rate) + + # dataloaderを準備する + # DataLoaderのプロセス数:0はメインプロセスになる + n_workers = min(args.max_data_loader_n_workers, os.cpu_count() - 1) # cpu_count-1 ただし最大で指定された数まで + train_dataloader = torch.utils.data.DataLoader( + train_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn, num_workers=n_workers) + + # 学習ステップ数を計算する + if args.max_train_epochs is not None: + args.max_train_steps = args.max_train_epochs * len(train_dataloader) + print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + + # lr schedulerを用意する + lr_scheduler = diffusers.optimization.get_scheduler( + args.lr_scheduler, optimizer, num_warmup_steps=args.lr_warmup_steps, num_training_steps=args.max_train_steps * args.gradient_accumulation_steps) + + # 実験的機能:勾配も含めたfp16学習を行う モデル全体をfp16にする + if args.full_fp16: + assert args.mixed_precision == "fp16", "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" + print("enable full fp16 training.") + unet.to(weight_dtype) + text_encoder.to(weight_dtype) + + # acceleratorがなんかよろしくやってくれるらしい + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + + # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする + if args.full_fp16: + train_util.patch_accelerator_for_fp16_training(accelerator) + + # resumeする + if args.resume is not None: + print(f"resume training from state: {args.resume}") + accelerator.load_state(args.resume) + + # epoch数を計算する + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # 学習する + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + print("running training / 学習開始") + print(f" num examples / サンプル数: {train_dataset.num_train_images}") + print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}") + print(f" num epochs / epoch数: {num_train_epochs}") + print(f" batch size per device / バッチサイズ: {args.train_batch_size}") + print(f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}") + print(f" gradient ccumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}") + print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}") + + progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") + global_step = 0 + + noise_scheduler = DDPMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", + num_train_timesteps=1000, clip_sample=False) + + if accelerator.is_main_process: + accelerator.init_trackers("finetuning") + + for epoch in range(num_train_epochs): + print(f"epoch {epoch+1}/{num_train_epochs}") + for m in training_models: + m.train() + + loss_total = 0 + for step, batch in enumerate(train_dataloader): + with accelerator.accumulate(training_models[0]): # 複数モデルに対応していない模様だがとりあえずこうしておく + with torch.no_grad(): + if 
"latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device) + else: + # latentに変換 + latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * 0.18215 + b_size = latents.shape[0] + + with torch.set_grad_enabled(args.train_text_encoder): + # Get the text embedding for conditioning + input_ids = batch["input_ids"].to(accelerator.device) + encoder_hidden_states = train_util.get_hidden_states( + args, input_ids, tokenizer, text_encoder, None if not args.full_fp16 else weight_dtype) + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents, device=latents.device) + + # Sample a random timestep for each image + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (b_size,), device=latents.device) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Predict the noise residual + noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) else: - text_encoder.eval() + target = noise - if not cache_latents: - vae.requires_grad_(False) - vae.eval() - vae.to(accelerator.device, dtype=weight_dtype) + loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="mean") - for m in training_models: - m.requires_grad_(True) - params = [] - for m in training_models: - params.extend(m.parameters()) - params_to_optimize = params + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = [] + for m in training_models: + params_to_clip.extend(m.parameters()) + accelerator.clip_grad_norm_(params_to_clip, 1.0) # args.max_grad_norm) - # 学習に必要なクラスを準備する - print('prepare optimizer, data loader etc.') + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=True) - # 8-bit Adamを使う - if args.use_8bit_adam: - try: - import bitsandbytes as bnb - except ImportError: - raise ImportError( - 'No bitsand bytes / bitsandbytesがインストールされていないようです' - ) - print('use 8-bit Adam optimizer') - optimizer_class = bnb.optim.AdamW8bit - else: - optimizer_class = torch.optim.AdamW + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 - # betaやweight decayはdiffusers DreamBoothもDreamBooth SDもデフォルト値のようなのでオプションはとりあえず省略 - optimizer = optimizer_class(params_to_optimize, lr=args.learning_rate) + current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず + if args.logging_dir is not None: + logs = {"loss": current_loss, "lr": lr_scheduler.get_last_lr()[0]} + accelerator.log(logs, step=global_step) - # dataloaderを準備する - # DataLoaderのプロセス数:0はメインプロセスになる - n_workers = min(8, os.cpu_count() - 1) # cpu_count-1 ただし最大8 - train_dataloader = torch.utils.data.DataLoader( - train_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_fn, - num_workers=n_workers, - ) + loss_total += current_loss + avr_loss = loss_total / (step+1) + logs = {"loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) - # lr schedulerを用意する - lr_scheduler = diffusers.optimization.get_scheduler( - args.lr_scheduler, - optimizer, - num_warmup_steps=args.lr_warmup_steps, - num_training_steps=args.max_train_steps - * 
args.gradient_accumulation_steps, - ) + if global_step >= args.max_train_steps: + break - # 実験的機能:勾配も含めたfp16学習を行う モデル全体をfp16にする - if args.full_fp16: - assert ( - args.mixed_precision == 'fp16' - ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" - print('enable full fp16 training.') - unet.to(weight_dtype) - text_encoder.to(weight_dtype) + if args.logging_dir is not None: + logs = {"epoch_loss": loss_total / len(train_dataloader)} + accelerator.log(logs, step=epoch+1) - # acceleratorがなんかよろしくやってくれるらしい - if args.train_text_encoder: - ( - unet, - text_encoder, - optimizer, - train_dataloader, - lr_scheduler, - ) = accelerator.prepare( - unet, text_encoder, optimizer, train_dataloader, lr_scheduler - ) - else: - unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, optimizer, train_dataloader, lr_scheduler - ) + accelerator.wait_for_everyone() - # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする - if args.full_fp16: - train_util.patch_accelerator_for_fp16_training(accelerator) + if args.save_every_n_epochs is not None: + src_path = src_stable_diffusion_ckpt if save_stable_diffusion_format else src_diffusers_model_path + train_util.save_sd_model_on_epoch_end(args, accelerator, src_path, save_stable_diffusion_format, use_safetensors, + save_dtype, epoch, num_train_epochs, global_step, unwrap_model(text_encoder), unwrap_model(unet), vae) - # resumeする - if args.resume is not None: - print(f'resume training from state: {args.resume}') - accelerator.load_state(args.resume) + is_main_process = accelerator.is_main_process + if is_main_process: + unet = unwrap_model(unet) + text_encoder = unwrap_model(text_encoder) - # epoch数を計算する - num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps - ) - num_train_epochs = math.ceil( - args.max_train_steps / num_update_steps_per_epoch - ) + accelerator.end_training() - # 学習する - total_batch_size = ( - args.train_batch_size - * accelerator.num_processes - * args.gradient_accumulation_steps - ) - print('running training / 学習開始') - print(f' num examples / サンプル数: {train_dataset.num_train_images}') - print(f' num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}') - print(f' num epochs / epoch数: {num_train_epochs}') - print(f' batch size per device / バッチサイズ: {args.train_batch_size}') - print( - f' total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}' - ) - print( - f' gradient ccumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}' - ) - print(f' total optimization steps / 学習ステップ数: {args.max_train_steps}') + if args.save_state: + train_util.save_state_on_train_end(args, accelerator) - progress_bar = tqdm( - range(args.max_train_steps), - smoothing=0, - disable=not accelerator.is_local_main_process, - desc='steps', - ) - global_step = 0 + del accelerator # この後メモリを使うのでこれは消す - noise_scheduler = DDPMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule='scaled_linear', - num_train_timesteps=1000, - clip_sample=False, - ) - - if accelerator.is_main_process: - accelerator.init_trackers('finetuning') - - for epoch in range(num_train_epochs): - print(f'epoch {epoch+1}/{num_train_epochs}') - for m in training_models: - m.train() - - loss_total = 0 - for step, batch in enumerate(train_dataloader): - with accelerator.accumulate( - training_models[0] - ): # 複数モデルに対応していない模様だがとりあえずこうしておく - with torch.no_grad(): - if 'latents' in batch and batch['latents'] is not None: 
- latents = batch['latents'].to(accelerator.device) - else: - # latentに変換 - latents = vae.encode( - batch['images'].to(dtype=weight_dtype) - ).latent_dist.sample() - latents = latents * 0.18215 - b_size = latents.shape[0] - - with torch.set_grad_enabled(args.train_text_encoder): - # Get the text embedding for conditioning - input_ids = batch['input_ids'].to(accelerator.device) - encoder_hidden_states = train_util.get_hidden_states( - args, - input_ids, - tokenizer, - text_encoder, - None if not args.full_fp16 else weight_dtype, - ) - - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents, device=latents.device) - - # Sample a random timestep for each image - timesteps = torch.randint( - 0, - noise_scheduler.config.num_train_timesteps, - (b_size,), - device=latents.device, - ) - timesteps = timesteps.long() - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise( - latents, noise, timesteps - ) - - # Predict the noise residual - noise_pred = unet( - noisy_latents, timesteps, encoder_hidden_states - ).sample - - if args.v_parameterization: - # v-parameterization training - target = noise_scheduler.get_velocity( - latents, noise, timesteps - ) - else: - target = noise - - loss = torch.nn.functional.mse_loss( - noise_pred.float(), target.float(), reduction='mean' - ) - - accelerator.backward(loss) - if accelerator.sync_gradients: - params_to_clip = [] - for m in training_models: - params_to_clip.extend(m.parameters()) - accelerator.clip_grad_norm_( - params_to_clip, 1.0 - ) # args.max_grad_norm) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad(set_to_none=True) - - # Checks if the accelerator has performed an optimization step behind the scenes - if accelerator.sync_gradients: - progress_bar.update(1) - global_step += 1 - - current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず - if args.logging_dir is not None: - logs = { - 'loss': current_loss, - 'lr': lr_scheduler.get_last_lr()[0], - } - accelerator.log(logs, step=global_step) - - loss_total += current_loss - avr_loss = loss_total / (step + 1) - logs = {'loss': avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} - progress_bar.set_postfix(**logs) - - if global_step >= args.max_train_steps: - break - - if args.logging_dir is not None: - logs = {'epoch_loss': loss_total / len(train_dataloader)} - accelerator.log(logs, step=epoch + 1) - - accelerator.wait_for_everyone() - - if args.save_every_n_epochs is not None: - src_path = ( - src_stable_diffusion_ckpt - if save_stable_diffusion_format - else src_diffusers_model_path - ) - train_util.save_sd_model_on_epoch_end( - args, - accelerator, - src_path, - save_stable_diffusion_format, - use_safetensors, - save_dtype, - epoch, - num_train_epochs, - global_step, - unwrap_model(text_encoder), - unwrap_model(unet), - vae, - ) - - is_main_process = accelerator.is_main_process - if is_main_process: - unet = unwrap_model(unet) - text_encoder = unwrap_model(text_encoder) - - accelerator.end_training() - - if args.save_state: - train_util.save_state_on_train_end(args, accelerator) - - del accelerator # この後メモリを使うのでこれは消す - - if is_main_process: - src_path = ( - src_stable_diffusion_ckpt - if save_stable_diffusion_format - else src_diffusers_model_path - ) - train_util.save_sd_model_on_train_end( - args, - src_path, - save_stable_diffusion_format, - use_safetensors, - save_dtype, - epoch, - global_step, - text_encoder, - unet, - vae, - ) - 
print('model saved.') + if is_main_process: + src_path = src_stable_diffusion_ckpt if save_stable_diffusion_format else src_diffusers_model_path + train_util.save_sd_model_on_train_end(args, src_path, save_stable_diffusion_format, use_safetensors, + save_dtype, epoch, global_step, text_encoder, unet, vae) + print("model saved.") if __name__ == '__main__': - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser() - train_util.add_sd_models_arguments(parser) - train_util.add_dataset_arguments(parser, False, True) - train_util.add_training_arguments(parser, False) - train_util.add_sd_saving_arguments(parser) + train_util.add_sd_models_arguments(parser) + train_util.add_dataset_arguments(parser, False, True) + train_util.add_training_arguments(parser, False) + train_util.add_sd_saving_arguments(parser) - parser.add_argument( - '--diffusers_xformers', - action='store_true', - help='use xformers by diffusers / Diffusersでxformersを使用する', - ) - parser.add_argument( - '--train_text_encoder', - action='store_true', - help='train text encoder / text encoderも学習する', - ) + parser.add_argument("--diffusers_xformers", action='store_true', + help='use xformers by diffusers / Diffusersでxformersを使用する') + parser.add_argument("--train_text_encoder", action="store_true", help="train text encoder / text encoderも学習する") - args = parser.parse_args() - train(args) + args = parser.parse_args() + train(args) diff --git a/finetune/make_captions.py b/finetune/make_captions.py index 601f0cc..b02420b 100644 --- a/finetune/make_captions.py +++ b/finetune/make_captions.py @@ -108,4 +108,4 @@ if __name__ == '__main__': if args.caption_extention is not None: args.caption_extension = args.caption_extention - main(args) \ No newline at end of file + main(args) diff --git a/finetune_gui copy.py b/finetune_gui copy.py deleted file mode 100644 index cb9f672..0000000 --- a/finetune_gui copy.py +++ /dev/null @@ -1,789 +0,0 @@ -import gradio as gr -import json -import math -import os -import subprocess -import pathlib -import shutil -import argparse -from library.common_gui import ( - get_folder_path, - get_file_path, - get_any_file_path, - get_saveasfile_path, -) -from library.utilities import utilities_tab - -folder_symbol = '\U0001f4c2' # 📂 -refresh_symbol = '\U0001f504' # 🔄 -save_style_symbol = '\U0001f4be' # 💾 -document_symbol = '\U0001F4C4' # 📄 - - -def save_configuration( - save_as, - file_path, - pretrained_model_name_or_path, - v2, - v_parameterization, - train_dir, - image_folder, - output_dir, - logging_dir, - max_resolution, - min_bucket_reso, - max_bucket_reso, - batch_size, - flip_aug, - caption_metadata_filename, - latent_metadata_filename, - full_path, - learning_rate, - lr_scheduler, - lr_warmup, - dataset_repeats, - train_batch_size, - epoch, - save_every_n_epochs, - mixed_precision, - save_precision, - seed, - num_cpu_threads_per_process, - train_text_encoder, - create_caption, - create_buckets, - save_model_as, - caption_extension, - use_8bit_adam, - xformers, - clip_skip, -): - original_file_path = file_path - - save_as_bool = True if save_as.get('label') == 'True' else False - - if save_as_bool: - print('Save as...') - file_path = get_saveasfile_path(file_path) - else: - print('Save...') - if file_path == None or file_path == '': - file_path = get_saveasfile_path(file_path) - - # print(file_path) - - if file_path == None: - return original_file_path - - # Return the values of the variables as a dictionary - variables = { - 'pretrained_model_name_or_path': pretrained_model_name_or_path, - 'v2': v2, 
- 'v_parameterization': v_parameterization, - 'train_dir': train_dir, - 'image_folder': image_folder, - 'output_dir': output_dir, - 'logging_dir': logging_dir, - 'max_resolution': max_resolution, - 'min_bucket_reso': min_bucket_reso, - 'max_bucket_reso': max_bucket_reso, - 'batch_size': batch_size, - 'flip_aug': flip_aug, - 'caption_metadata_filename': caption_metadata_filename, - 'latent_metadata_filename': latent_metadata_filename, - 'full_path': full_path, - 'learning_rate': learning_rate, - 'lr_scheduler': lr_scheduler, - 'lr_warmup': lr_warmup, - 'dataset_repeats': dataset_repeats, - 'train_batch_size': train_batch_size, - 'epoch': epoch, - 'save_every_n_epochs': save_every_n_epochs, - 'mixed_precision': mixed_precision, - 'save_precision': save_precision, - 'seed': seed, - 'num_cpu_threads_per_process': num_cpu_threads_per_process, - 'train_text_encoder': train_text_encoder, - 'create_buckets': create_buckets, - 'create_caption': create_caption, - 'save_model_as': save_model_as, - 'caption_extension': caption_extension, - 'use_8bit_adam': use_8bit_adam, - 'xformers': xformers, - 'clip_skip': clip_skip, - } - - # Save the data to the selected file - with open(file_path, 'w') as file: - json.dump(variables, file) - - return file_path - - -def open_config_file( - file_path, - pretrained_model_name_or_path, - v2, - v_parameterization, - train_dir, - image_folder, - output_dir, - logging_dir, - max_resolution, - min_bucket_reso, - max_bucket_reso, - batch_size, - flip_aug, - caption_metadata_filename, - latent_metadata_filename, - full_path, - learning_rate, - lr_scheduler, - lr_warmup, - dataset_repeats, - train_batch_size, - epoch, - save_every_n_epochs, - mixed_precision, - save_precision, - seed, - num_cpu_threads_per_process, - train_text_encoder, - create_caption, - create_buckets, - save_model_as, - caption_extension, - use_8bit_adam, - xformers, - clip_skip, -): - original_file_path = file_path - file_path = get_file_path(file_path) - - if file_path != '' and file_path != None: - print(file_path) - # load variables from JSON file - with open(file_path, 'r') as f: - my_data = json.load(f) - else: - file_path = original_file_path # In case a file_path was provided and the user decide to cancel the open action - my_data = {} - - # Return the values of the variables as a dictionary - return ( - file_path, - my_data.get( - 'pretrained_model_name_or_path', pretrained_model_name_or_path - ), - my_data.get('v2', v2), - my_data.get('v_parameterization', v_parameterization), - my_data.get('train_dir', train_dir), - my_data.get('image_folder', image_folder), - my_data.get('output_dir', output_dir), - my_data.get('logging_dir', logging_dir), - my_data.get('max_resolution', max_resolution), - my_data.get('min_bucket_reso', min_bucket_reso), - my_data.get('max_bucket_reso', max_bucket_reso), - my_data.get('batch_size', batch_size), - my_data.get('flip_aug', flip_aug), - my_data.get('caption_metadata_filename', caption_metadata_filename), - my_data.get('latent_metadata_filename', latent_metadata_filename), - my_data.get('full_path', full_path), - my_data.get('learning_rate', learning_rate), - my_data.get('lr_scheduler', lr_scheduler), - my_data.get('lr_warmup', lr_warmup), - my_data.get('dataset_repeats', dataset_repeats), - my_data.get('train_batch_size', train_batch_size), - my_data.get('epoch', epoch), - my_data.get('save_every_n_epochs', save_every_n_epochs), - my_data.get('mixed_precision', mixed_precision), - my_data.get('save_precision', save_precision), - my_data.get('seed', seed), - 
my_data.get( - 'num_cpu_threads_per_process', num_cpu_threads_per_process - ), - my_data.get('train_text_encoder', train_text_encoder), - my_data.get('create_buckets', create_buckets), - my_data.get('create_caption', create_caption), - my_data.get('save_model_as', save_model_as), - my_data.get('caption_extension', caption_extension), - my_data.get('use_8bit_adam', use_8bit_adam), - my_data.get('xformers', xformers), - my_data.get('clip_skip', clip_skip), - ) - - -def train_model( - pretrained_model_name_or_path, - v2, - v_parameterization, - train_dir, - image_folder, - output_dir, - logging_dir, - max_resolution, - min_bucket_reso, - max_bucket_reso, - batch_size, - flip_aug, - caption_metadata_filename, - latent_metadata_filename, - full_path, - learning_rate, - lr_scheduler, - lr_warmup, - dataset_repeats, - train_batch_size, - epoch, - save_every_n_epochs, - mixed_precision, - save_precision, - seed, - num_cpu_threads_per_process, - train_text_encoder, - generate_caption_database, - generate_image_buckets, - save_model_as, - caption_extension, - use_8bit_adam, - xformers, - clip_skip, -): - def save_inference_file(output_dir, v2, v_parameterization): - # Copy inference model for v2 if required - if v2 and v_parameterization: - print(f'Saving v2-inference-v.yaml as {output_dir}/last.yaml') - shutil.copy( - f'./v2_inference/v2-inference-v.yaml', - f'{output_dir}/last.yaml', - ) - elif v2: - print(f'Saving v2-inference.yaml as {output_dir}/last.yaml') - shutil.copy( - f'./v2_inference/v2-inference.yaml', - f'{output_dir}/last.yaml', - ) - - # create caption json file - if generate_caption_database: - if not os.path.exists(train_dir): - os.mkdir(train_dir) - - for root, dirs, files in os.walk(image_folder): - for dir in dirs: - print(os.path.join(root, dir)) - - run_cmd = ( - f'./venv/Scripts/python.exe finetune/merge_captions_to_metadata.py' - ) - if caption_extension == '': - run_cmd += f' --caption_extension=".txt"' - else: - run_cmd += f' --caption_extension={caption_extension}' - run_cmd += f' "{os.path.join(root, dir)}"' - run_cmd += f' "{train_dir}/{caption_metadata_filename}"' - if full_path: - run_cmd += f' --full_path' - - print(run_cmd) - - # Run the command - subprocess.run(run_cmd) - - # create images buckets - if generate_image_buckets: - run_cmd = ( - f'./venv/Scripts/python.exe finetune/prepare_buckets_latents.py' - ) - run_cmd += f' "crap"' - run_cmd += f' "{train_dir}/{caption_metadata_filename}"' - run_cmd += f' "{train_dir}/{latent_metadata_filename}"' - run_cmd += f' "{pretrained_model_name_or_path}"' - run_cmd += f' --batch_size={batch_size}' - run_cmd += f' --max_resolution={max_resolution}' - run_cmd += f' --min_bucket_reso={min_bucket_reso}' - run_cmd += f' --max_bucket_reso={max_bucket_reso}' - run_cmd += f' --mixed_precision={mixed_precision}' - if flip_aug: - run_cmd += f' --flip_aug' - if full_path: - run_cmd += f' --full_path' - - print(run_cmd) - - # Run the command - subprocess.run(run_cmd) - - image_num = 0 - for root, dirs, files in os.walk(image_folder): - for dir in dirs: - image_num += len( - [f for f in os.listdir(os.path.join(root, dir)) if f.endswith('.npz')] - ) - print(f'image_num = {image_num}') - - repeats = int(image_num) * int(dataset_repeats) - print(f'repeats = {str(repeats)}') - - # calculate max_train_steps - max_train_steps = int( - math.ceil(float(repeats) / int(train_batch_size) * int(epoch)) - ) - - # Divide by two because flip augmentation create two copied of the source images - if flip_aug: - max_train_steps = 
int(math.ceil(float(max_train_steps) / 2)) - - print(f'max_train_steps = {max_train_steps}') - - lr_warmup_steps = round(float(int(lr_warmup) * int(max_train_steps) / 100)) - print(f'lr_warmup_steps = {lr_warmup_steps}') - - run_cmd = f'accelerate launch --num_cpu_threads_per_process={num_cpu_threads_per_process} "./fine_tune.py"' - if v2: - run_cmd += ' --v2' - if v_parameterization: - run_cmd += ' --v_parameterization' - if train_text_encoder: - run_cmd += ' --train_text_encoder' - if use_8bit_adam: - run_cmd += f' --use_8bit_adam' - if xformers: - run_cmd += f' --xformers' - run_cmd += ( - f' --pretrained_model_name_or_path="{pretrained_model_name_or_path}"' - ) - run_cmd += f' --in_json="{train_dir}/{latent_metadata_filename}"' - run_cmd += f' --train_data_dir="{image_folder}"' - run_cmd += f' --output_dir="{output_dir}"' - if not logging_dir == '': - run_cmd += f' --logging_dir="{logging_dir}"' - run_cmd += f' --train_batch_size={train_batch_size}' - run_cmd += f' --dataset_repeats={dataset_repeats}' - run_cmd += f' --learning_rate={learning_rate}' - run_cmd += f' --lr_scheduler={lr_scheduler}' - run_cmd += f' --lr_warmup_steps={lr_warmup_steps}' - run_cmd += f' --max_train_steps={max_train_steps}' - run_cmd += f' --mixed_precision={mixed_precision}' - run_cmd += f' --save_every_n_epochs={save_every_n_epochs}' - run_cmd += f' --seed={seed}' - run_cmd += f' --save_precision={save_precision}' - if not save_model_as == 'same as source model': - run_cmd += f' --save_model_as={save_model_as}' - if int(clip_skip) > 1: - run_cmd += f' --clip_skip={str(clip_skip)}' - - print(run_cmd) - # Run the command - subprocess.run(run_cmd) - - # check if output_dir/last is a folder... therefore it is a diffuser model - last_dir = pathlib.Path(f'{output_dir}/last') - - if not last_dir.is_dir(): - # Copy inference model for v2 if required - save_inference_file(output_dir, v2, v_parameterization) - - -def set_pretrained_model_name_or_path_input(value, v2, v_parameterization): - # define a list of substrings to search for - substrings_v2 = [ - 'stabilityai/stable-diffusion-2-1-base', - 'stabilityai/stable-diffusion-2-base', - ] - - # check if $v2 and $v_parameterization are empty and if $pretrained_model_name_or_path contains any of the substrings in the v2 list - if str(value) in substrings_v2: - print('SD v2 model detected. Setting --v2 parameter') - v2 = True - v_parameterization = False - - return value, v2, v_parameterization - - # define a list of substrings to search for v-objective - substrings_v_parameterization = [ - 'stabilityai/stable-diffusion-2-1', - 'stabilityai/stable-diffusion-2', - ] - - # check if $v2 and $v_parameterization are empty and if $pretrained_model_name_or_path contains any of the substrings in the v_parameterization list - if str(value) in substrings_v_parameterization: - print( - 'SD v2 v_parameterization detected. 
Setting --v2 parameter and --v_parameterization' - ) - v2 = True - v_parameterization = True - - return value, v2, v_parameterization - - # define a list of substrings to v1.x - substrings_v1_model = [ - 'CompVis/stable-diffusion-v1-4', - 'runwayml/stable-diffusion-v1-5', - ] - - if str(value) in substrings_v1_model: - v2 = False - v_parameterization = False - - return value, v2, v_parameterization - - if value == 'custom': - value = '' - v2 = False - v_parameterization = False - - return value, v2, v_parameterization - - -def remove_doublequote(file_path): - if file_path != None: - file_path = file_path.replace('"', '') - - return file_path - - -def UI(username, password): - - css = '' - - if os.path.exists('./style.css'): - with open(os.path.join('./style.css'), 'r', encoding='utf8') as file: - print('Load CSS...') - css += file.read() + '\n' - - interface = gr.Blocks(css=css) - - with interface: - with gr.Tab('Finetune'): - finetune_tab() - with gr.Tab('Utilities'): - utilities_tab(enable_dreambooth_tab=False) - - # Show the interface - if not username == '': - interface.launch(auth=(username, password)) - else: - interface.launch() - - -def finetune_tab(): - dummy_ft_true = gr.Label(value=True, visible=False) - dummy_ft_false = gr.Label(value=False, visible=False) - gr.Markdown('Train a custom model using kohya finetune python code...') - with gr.Accordion('Configuration file', open=False): - with gr.Row(): - button_open_config = gr.Button( - f'Open {folder_symbol}', elem_id='open_folder' - ) - button_save_config = gr.Button( - f'Save {save_style_symbol}', elem_id='open_folder' - ) - button_save_as_config = gr.Button( - f'Save as... {save_style_symbol}', - elem_id='open_folder', - ) - config_file_name = gr.Textbox( - label='', placeholder='type file path or use buttons...' 
- ) - config_file_name.change( - remove_doublequote, - inputs=[config_file_name], - outputs=[config_file_name], - ) - with gr.Tab('Source model'): - # Define the input elements - with gr.Row(): - pretrained_model_name_or_path_input = gr.Textbox( - label='Pretrained model name or path', - placeholder='enter the path to custom model or name of pretrained model', - ) - pretrained_model_name_or_path_file = gr.Button( - document_symbol, elem_id='open_folder_small' - ) - pretrained_model_name_or_path_file.click( - get_any_file_path, - inputs=pretrained_model_name_or_path_input, - outputs=pretrained_model_name_or_path_input, - ) - pretrained_model_name_or_path_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' - ) - pretrained_model_name_or_path_folder.click( - get_folder_path, - inputs=pretrained_model_name_or_path_input, - outputs=pretrained_model_name_or_path_input, - ) - model_list = gr.Dropdown( - label='(Optional) Model Quick Pick', - choices=[ - 'custom', - 'stabilityai/stable-diffusion-2-1-base', - 'stabilityai/stable-diffusion-2-base', - 'stabilityai/stable-diffusion-2-1', - 'stabilityai/stable-diffusion-2', - 'runwayml/stable-diffusion-v1-5', - 'CompVis/stable-diffusion-v1-4', - ], - ) - save_model_as_dropdown = gr.Dropdown( - label='Save trained model as', - choices=[ - 'same as source model', - 'ckpt', - 'diffusers', - 'diffusers_safetensors', - 'safetensors', - ], - value='same as source model', - ) - - with gr.Row(): - v2_input = gr.Checkbox(label='v2', value=True) - v_parameterization_input = gr.Checkbox( - label='v_parameterization', value=False - ) - model_list.change( - set_pretrained_model_name_or_path_input, - inputs=[model_list, v2_input, v_parameterization_input], - outputs=[ - pretrained_model_name_or_path_input, - v2_input, - v_parameterization_input, - ], - ) - with gr.Tab('Folders'): - with gr.Row(): - train_dir_input = gr.Textbox( - label='Training config folder', - placeholder='folder where the training configuration files will be saved', - ) - train_dir_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' - ) - train_dir_folder.click(get_folder_path, outputs=train_dir_input) - - image_folder_input = gr.Textbox( - label='Training Image folder', - placeholder='folder where the training images are located', - ) - image_folder_input_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' - ) - image_folder_input_folder.click( - get_folder_path, outputs=image_folder_input - ) - with gr.Row(): - output_dir_input = gr.Textbox( - label='Output folder', - placeholder='folder where the model will be saved', - ) - output_dir_input_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' - ) - output_dir_input_folder.click( - get_folder_path, outputs=output_dir_input - ) - - logging_dir_input = gr.Textbox( - label='Logging folder', - placeholder='Optional: enable logging and output TensorBoard log to this folder', - ) - logging_dir_input_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' - ) - logging_dir_input_folder.click( - get_folder_path, outputs=logging_dir_input - ) - train_dir_input.change( - remove_doublequote, - inputs=[train_dir_input], - outputs=[train_dir_input], - ) - image_folder_input.change( - remove_doublequote, - inputs=[image_folder_input], - outputs=[image_folder_input], - ) - output_dir_input.change( - remove_doublequote, - inputs=[output_dir_input], - outputs=[output_dir_input], - ) - with gr.Tab('Dataset preparation'): - with gr.Row(): - max_resolution_input = gr.Textbox( - label='Resolution 
(width,height)', value='512,512' - ) - min_bucket_reso = gr.Textbox( - label='Min bucket resolution', value='256' - ) - max_bucket_reso = gr.Textbox( - label='Max bucket resolution', value='1024' - ) - batch_size = gr.Textbox(label='Batch size', value='1') - with gr.Accordion('Advanced parameters', open=False): - with gr.Row(): - caption_metadata_filename = gr.Textbox( - label='Caption metadata filename', value='meta_cap.json' - ) - latent_metadata_filename = gr.Textbox( - label='Latent metadata filename', value='meta_lat.json' - ) - full_path = gr.Checkbox(label='Use full path', value=True) - flip_aug = gr.Checkbox(label='Flip augmentation', value=False) - with gr.Tab('Training parameters'): - with gr.Row(): - learning_rate_input = gr.Textbox(label='Learning rate', value=1e-6) - lr_scheduler_input = gr.Dropdown( - label='LR Scheduler', - choices=[ - 'constant', - 'constant_with_warmup', - 'cosine', - 'cosine_with_restarts', - 'linear', - 'polynomial', - ], - value='constant', - ) - lr_warmup_input = gr.Textbox(label='LR warmup', value=0) - with gr.Row(): - dataset_repeats_input = gr.Textbox( - label='Dataset repeats', value=40 - ) - train_batch_size_input = gr.Slider( - minimum=1, - maximum=32, - label='Train batch size', - value=1, - step=1, - ) - epoch_input = gr.Textbox(label='Epoch', value=1) - save_every_n_epochs_input = gr.Textbox( - label='Save every N epochs', value=1 - ) - with gr.Row(): - mixed_precision_input = gr.Dropdown( - label='Mixed precision', - choices=[ - 'no', - 'fp16', - 'bf16', - ], - value='fp16', - ) - save_precision_input = gr.Dropdown( - label='Save precision', - choices=[ - 'float', - 'fp16', - 'bf16', - ], - value='fp16', - ) - num_cpu_threads_per_process_input = gr.Slider( - minimum=1, - maximum=os.cpu_count(), - step=1, - label='Number of CPU threads per process', - value=os.cpu_count(), - ) - seed_input = gr.Textbox(label='Seed', value=1234) - with gr.Row(): - caption_extention_input = gr.Textbox( - label='Caption Extension', - placeholder='(Optional) Extension for caption files. 
default: .txt', - ) - train_text_encoder_input = gr.Checkbox( - label='Train text encoder', value=True - ) - with gr.Accordion('Advanced parameters', open=False): - with gr.Row(): - use_8bit_adam = gr.Checkbox(label='Use 8bit adam', value=True) - xformers = gr.Checkbox(label='Use xformers', value=True) - clip_skip = gr.Slider( - label='Clip skip', value='1', minimum=1, maximum=12, step=1 - ) - with gr.Box(): - with gr.Row(): - create_caption = gr.Checkbox( - label='Generate caption metadata', value=True - ) - create_buckets = gr.Checkbox( - label='Generate image buckets metadata', value=True - ) - - button_run = gr.Button('Train model') - - settings_list = [ - pretrained_model_name_or_path_input, - v2_input, - v_parameterization_input, - train_dir_input, - image_folder_input, - output_dir_input, - logging_dir_input, - max_resolution_input, - min_bucket_reso, - max_bucket_reso, - batch_size, - flip_aug, - caption_metadata_filename, - latent_metadata_filename, - full_path, - learning_rate_input, - lr_scheduler_input, - lr_warmup_input, - dataset_repeats_input, - train_batch_size_input, - epoch_input, - save_every_n_epochs_input, - mixed_precision_input, - save_precision_input, - seed_input, - num_cpu_threads_per_process_input, - train_text_encoder_input, - create_caption, - create_buckets, - save_model_as_dropdown, - caption_extention_input, - use_8bit_adam, - xformers, - clip_skip, - ] - - button_run.click(train_model, inputs=settings_list) - - button_open_config.click( - open_config_file, - inputs=[config_file_name] + settings_list, - outputs=[config_file_name] + settings_list, - ) - - button_save_config.click( - save_configuration, - inputs=[dummy_ft_false, config_file_name] + settings_list, - outputs=[config_file_name], - ) - - button_save_as_config.click( - save_configuration, - inputs=[dummy_ft_true, config_file_name] + settings_list, - outputs=[config_file_name], - ) - - -if __name__ == '__main__': - # torch.cuda.set_per_process_memory_fraction(0.48) - parser = argparse.ArgumentParser() - parser.add_argument( - '--username', type=str, default='', help='Username for authentication' - ) - parser.add_argument( - '--password', type=str, default='', help='Password for authentication' - ) - - args = parser.parse_args() - - UI(username=args.username, password=args.password) diff --git a/finetune_gui.py b/finetune_gui.py index 6abadbf..2459058 100644 --- a/finetune_gui.py +++ b/finetune_gui.py @@ -13,6 +13,7 @@ from library.common_gui import ( get_saveasfile_path, save_inference_file, set_pretrained_model_name_or_path_input, + gradio_advanced_training,run_cmd_advanced_training ) from library.utilities import utilities_tab @@ -67,6 +68,8 @@ def save_configuration( shuffle_caption, output_name, max_token_length, + max_train_epochs, + max_data_loader_n_workers, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -150,6 +153,8 @@ def open_config_file( shuffle_caption, output_name, max_token_length, + max_train_epochs, + max_data_loader_n_workers, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -218,6 +223,8 @@ def train_model( shuffle_caption, output_name, max_token_length, + max_train_epochs, + max_data_loader_n_workers, ): # create caption json file if generate_caption_database: @@ -336,6 +343,7 @@ def train_model( run_cmd += f' --output_name="{output_name}"' if (int(max_token_length) > 75): run_cmd += f' --max_token_length={max_token_length}' + run_cmd += run_cmd_advanced_training(max_train_epochs=max_train_epochs, 
max_data_loader_n_workers=max_data_loader_n_workers) print(run_cmd) # Run the command @@ -656,6 +664,7 @@ def finetune_tab(): ], value='75', ) + max_train_epochs, max_data_loader_n_workers = gradio_advanced_training() with gr.Box(): with gr.Row(): create_caption = gr.Checkbox( @@ -710,6 +719,8 @@ def finetune_tab(): shuffle_caption, output_name, max_token_length, + max_train_epochs, + max_data_loader_n_workers, ] button_run.click(train_model, inputs=settings_list) diff --git a/gen_img_diffusers.py b/gen_img_diffusers.py index edc007e..4edfe0b 100644 --- a/gen_img_diffusers.py +++ b/gen_img_diffusers.py @@ -1,38 +1,3 @@ -# txt2img with Diffusers: supports SD checkpoints, EulerScheduler, clip-skip, 225 tokens, Hypernetwork etc... - -# v2: CLIP guided Stable Diffusion, Image guided Stable Diffusion, highres. fix -# v3: Add dpmsolver/dpmsolver++, add VAE loading, add upscale, add 'bf16', fix the issue network_mul is not working -# v4: SD2.0 support (new U-Net/text encoder/tokenizer), simplify by DiffUsers 0.9.0, no_preview in interactive mode -# v5: fix clip_sample=True for scheduler, add VGG guidance -# v6: refactor to use model util, load VAE without vae folder, support safe tensors -# v7: add use_original_file_name and iter_same_seed option, change vgg16 guide input image size, -# Diffusers 0.10.0 (support new schedulers (dpm_2, dpm_2_a, heun, dpmsingle), supports all scheduler in v-prediction) -# v8: accept wildcard for ckpt name (when only one file is matched), fix a bug app crushes because PIL image doesn't have filename attr sometimes, -# v9: sort file names, fix an issue in img2img when prompt from metadata with images_per_prompt>1 -# v10: fix app crashes when different image size in prompts - -# Copyright 2022 kohya_ss @kohya_ss -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# license of included scripts: - -# FlashAttention: based on https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/memory_efficient_attention_pytorch/flash_attention.py -# MIT https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/LICENSE - -# Diffusers (model conversion, CLIP guided stable diffusion, schedulers etc.): -# ASL 2.0 https://github.com/huggingface/diffusers/blob/main/LICENSE - """ VGG( (features): Sequential( @@ -81,11 +46,13 @@ VGG( ) """ +import json from typing import List, Optional, Union import glob import importlib import inspect import time +import zipfile from diffusers.utils import deprecate from diffusers.configuration_utils import FrozenDict import argparse @@ -517,7 +484,7 @@ class PipelineLike(): self.vgg16_feat_model = torchvision.models._utils.IntermediateLayerGetter(vgg16_model.features, return_layers=return_layers) self.vgg16_normalize = transforms.Normalize(mean=VGG16_IMAGE_MEAN, std=VGG16_IMAGE_STD) -# region xformersとか使う部分:独自に書き換えるので関係なし + # region xformersとか使う部分:独自に書き換えるので関係なし def enable_xformers_memory_efficient_attention(self): r""" Enable memory efficient attention as implemented in xformers. 
@@ -590,6 +557,7 @@ class PipelineLike(): width: int = 512, num_inference_steps: int = 50, guidance_scale: float = 7.5, + negative_scale: float = None, strength: float = 0.8, # num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, @@ -708,6 +676,11 @@ class PipelineLike(): # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 + + if not do_classifier_free_guidance and negative_scale is not None: + print(f"negative_scale is ignored if guidance scalle <= 1.0") + negative_scale = None + # get unconditional embeddings for classifier free guidance if negative_prompt is None: negative_prompt = [""] * batch_size @@ -729,8 +702,21 @@ class PipelineLike(): **kwargs, ) + if negative_scale is not None: + _, real_uncond_embeddings, _ = get_weighted_text_embeddings( + pipe=self, + prompt=prompt, # こちらのトークン長に合わせてuncondを作るので75トークン超で必須 + uncond_prompt=[""]*batch_size, + max_embeddings_multiples=max_embeddings_multiples, + clip_skip=self.clip_skip, + **kwargs, + ) + if do_classifier_free_guidance: - text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + if negative_scale is None: + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + else: + text_embeddings = torch.cat([uncond_embeddings, text_embeddings, real_uncond_embeddings]) # CLIP guidanceで使用するembeddingsを取得する if self.clip_guidance_scale > 0: @@ -861,22 +847,28 @@ class PipelineLike(): if accepts_eta: extra_step_kwargs["eta"] = eta + num_latent_input = (3 if negative_scale is not None else 2) if do_classifier_free_guidance else 1 for i, t in enumerate(tqdm(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = latents.repeat((2, 1, 1, 1)) if do_classifier_free_guidance else latents + latent_model_input = latents.repeat((num_latent_input, 1, 1, 1)) latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - # predict the noise residual noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if negative_scale is None: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(num_latent_input) # uncond by negative prompt + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + else: + noise_pred_negative, noise_pred_text, noise_pred_uncond = noise_pred.chunk(num_latent_input) # uncond is real uncond + noise_pred = noise_pred_uncond + guidance_scale * \ + (noise_pred_text - noise_pred_uncond) - negative_scale * (noise_pred_negative - noise_pred_uncond) # perform clip guidance if self.clip_guidance_scale > 0 or self.clip_image_guidance_scale > 0 or self.vgg16_guidance_scale > 0: - text_embeddings_for_guidance = (text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings) + text_embeddings_for_guidance = (text_embeddings.chunk(num_latent_input)[ + 1] if do_classifier_free_guidance else text_embeddings) if self.clip_guidance_scale > 0: noise_pred, latents = self.cond_fn(latents, t, i, text_embeddings_for_guidance, noise_pred, @@ -1982,26 +1974,50 @@ def main(args): vgg16_model.to(dtype).to(device) # networkを組み込む - if args.network_module is not None: - # assert not args.diffusers_xformers, "cannot use network with diffusers_xformers / 
diffusers_xformers指定時はnetworkは利用できません" + if args.network_module: + networks = [] + for i, network_module in enumerate(args.network_module): + print("import network module:", network_module) + imported_module = importlib.import_module(network_module) - print("import network module:", args.network_module) - network_module = importlib.import_module(args.network_module) + network_mul = 1.0 if args.network_mul is None or len(args.network_mul) <= i else args.network_mul[i] + network_dim = None if args.network_dim is None or len(args.network_dim) <= i else args.network_dim[i] - network = network_module.create_network(args.network_mul, args.network_dim, vae,text_encoder, unet) # , **net_kwargs) - if network is None: - return + net_kwargs = {} + if args.network_args and i < len(args.network_args): + network_args = args.network_args[i] + # TODO escape special chars + network_args = network_args.split(";") + for net_arg in network_args: + key, value = net_arg.split("=") + net_kwargs[key] = value - print("load network weights from:", args.network_weights) - network.load_weights(args.network_weights) + network = imported_module.create_network(network_mul, network_dim, vae, text_encoder, unet, **net_kwargs) + if network is None: + return - network.apply_to(text_encoder, unet) + if args.network_weights and i < len(args.network_weights): + network_weight = args.network_weights[i] + print("load network weights from:", network_weight) - if args.opt_channels_last: - network.to(memory_format=torch.channels_last) - network.to(dtype).to(device) + if os.path.splitext(network_weight)[1] == '.safetensors': + from safetensors.torch import safe_open + with safe_open(network_weight, framework="pt") as f: + metadata = f.metadata() + if metadata is not None: + print(f"metadata for: {network_weight}: {metadata}") + + network.load_weights(network_weight) + + network.apply_to(text_encoder, unet) + + if args.opt_channels_last: + network.to(memory_format=torch.channels_last) + network.to(dtype).to(device) + + networks.append(network) else: - network = None + networks = [] if args.opt_channels_last: print(f"set optimizing: channels last") @@ -2010,8 +2026,9 @@ def main(args): unet.to(memory_format=torch.channels_last) if clip_model is not None: clip_model.to(memory_format=torch.channels_last) - if network is not None: - network.to(memory_format=torch.channels_last) + if networks: + for network in networks: + network.to(memory_format=torch.channels_last) if vgg16_model is not None: vgg16_model.to(memory_format=torch.channels_last) @@ -2053,7 +2070,7 @@ def main(args): print(f"convert image to RGB from {image.mode}: {p}") image = image.convert("RGB") images.append(image) - + return images def resize_images(imgs, size): @@ -2154,12 +2171,12 @@ def main(args): # 1st stageのバッチを作成して呼び出す print("process 1st stage1") batch_1st = [] - for params1, (width, height, steps, scale, strength) in batch: + for params1, (width, height, steps, scale, negative_scale, strength) in batch: width_1st = int(width * args.highres_fix_scale + .5) height_1st = int(height * args.highres_fix_scale + .5) width_1st = width_1st - width_1st % 32 height_1st = height_1st - height_1st % 32 - batch_1st.append((params1, (width_1st, height_1st, args.highres_fix_steps, scale, strength))) + batch_1st.append((params1, (width_1st, height_1st, args.highres_fix_steps, scale, negative_scale, strength))) images_1st = process_batch(batch_1st, True, True) # 2nd stageのバッチを作成して以下処理する @@ -2171,7 +2188,8 @@ def main(args): batch_2nd.append(((step, prompt, negative_prompt, seed+1, 
image, None, clip_prompt, guide_image), params2)) batch = batch_2nd - (step_first, _, _, _, init_image, mask_image, _, guide_image), (width, height, steps, scale, strength) = batch[0] + (step_first, _, _, _, init_image, mask_image, _, guide_image), (width, + height, steps, scale, negative_scale, strength) = batch[0] noise_shape = (LATENT_CHANNELS, height // DOWNSAMPLING_FACTOR, width // DOWNSAMPLING_FACTOR) prompts = [] @@ -2247,7 +2265,7 @@ def main(args): guide_images = guide_images[0] # generate - images = pipe(prompts, negative_prompts, init_images, mask_images, height, width, steps, scale, strength, latents=start_code, + images = pipe(prompts, negative_prompts, init_images, mask_images, height, width, steps, scale, negative_scale, strength, latents=start_code, output_type='pil', max_embeddings_multiples=max_embeddings_multiples, img2img_noise=i2i_noises, clip_prompts=clip_prompts, clip_guide_images=guide_images)[0] if highres_1st and not args.highres_fix_save_1st: return images @@ -2264,6 +2282,8 @@ def main(args): metadata.add_text("scale", str(scale)) if negative_prompt is not None: metadata.add_text("negative-prompt", negative_prompt) + if negative_scale is not None: + metadata.add_text("negative-scale", str(negative_scale)) if clip_prompt is not None: metadata.add_text("clip-prompt", clip_prompt) @@ -2316,6 +2336,7 @@ def main(args): width = args.W height = args.H scale = args.scale + negative_scale = args.negative_scale steps = args.steps seeds = None strength = 0.8 if args.strength is None else args.strength @@ -2358,6 +2379,15 @@ def main(args): print(f"scale: {scale}") continue + m = re.match(r'nl ([\d\.]+|none|None)', parg, re.IGNORECASE) + if m: # negative scale + if m.group(1).lower() == 'none': + negative_scale = None + else: + negative_scale = float(m.group(1)) + print(f"negative scale: {negative_scale}") + continue + m = re.match(r't ([\d\.]+)', parg, re.IGNORECASE) if m: # strength strength = float(m.group(1)) @@ -2420,8 +2450,9 @@ def main(args): print("Use previous image as guide image.") guide_image = prev_image + # TODO named tupleか何かにする b1 = ((global_step, prompt, negative_prompt, seed, init_image, mask_image, clip_prompt, guide_image), - (width, height, steps, scale, strength)) + (width, height, steps, scale, negative_scale, strength)) if len(batch_data) > 0 and batch_data[-1][1] != b1[1]: # バッチ分割必要? 
process_batch(batch_data, highres_fix) batch_data.clear() @@ -2481,19 +2512,24 @@ if __name__ == '__main__': # help="Replace CLIP (Text Encoder) to l/14@336 / CLIP(Text Encoder)をl/14@336に入れ替える") parser.add_argument("--seed", type=int, default=None, help="seed, or seed of seeds in multiple generation / 1枚生成時のseed、または複数枚生成時の乱数seedを決めるためのseed") - parser.add_argument("--iter_same_seed", action='store_true', help='use same seed for all prompts in iteration if no seed specified / 乱数seedの指定がないとき繰り返し内はすべて同じseedを使う(プロンプト間の差異の比較用)') + parser.add_argument("--iter_same_seed", action='store_true', + help='use same seed for all prompts in iteration if no seed specified / 乱数seedの指定がないとき繰り返し内はすべて同じseedを使う(プロンプト間の差異の比較用)') parser.add_argument("--fp16", action='store_true', help='use fp16 / fp16を指定し省メモリ化する') parser.add_argument("--bf16", action='store_true', help='use bfloat16 / bfloat16を指定し省メモリ化する') parser.add_argument("--xformers", action='store_true', help='use xformers / xformersを使用し高速化する') parser.add_argument("--diffusers_xformers", action='store_true', - help='use xformers by diffusers (Hypernetworks doesn\'t work) / Diffusersでxformersを使用する(Hypernetwork利用不可)') + help='use xformers by diffusers (Hypernetworks doesn\'t work) / Diffusersでxformersを使用する(Hypernetwork利用不可)') parser.add_argument("--opt_channels_last", action='store_true', - help='set channels last option to model / モデルにchannels lastを指定し最適化する') - parser.add_argument("--network_module", type=str, default=None, help='Hypernetwork module to use / Hypernetworkを使う時そのモジュール名') - parser.add_argument("--network_weights", type=str, default=None, help='Hypernetwork weights to load / Hypernetworkの重み') - parser.add_argument("--network_mul", type=float, default=1.0, help='Hypernetwork multiplier / Hypernetworkの効果の倍率') - parser.add_argument("--network_dim", type=int, default=None, + help='set channels last option to model / モデルにchannels lastを指定し最適化する') + parser.add_argument("--network_module", type=str, default=None, nargs='*', + help='Hypernetwork module to use / Hypernetworkを使う時そのモジュール名') + parser.add_argument("--network_weights", type=str, default=None, nargs='*', + help='Hypernetwork weights to load / Hypernetworkの重み') + parser.add_argument("--network_mul", type=float, default=None, nargs='*', help='Hypernetwork multiplier / Hypernetworkの効果の倍率') + parser.add_argument("--network_dim", type=int, default=None, nargs='*', help='network dimensions (depends on each network) / モジュールの次元数(ネットワークにより定義は異なります)') + parser.add_argument("--network_args", type=str, default=None, nargs='*', + help='additional arguments for network (key=value) / ネットワークへの追加の引数') parser.add_argument("--clip_skip", type=int, default=None, help='layer number from bottom to use in CLIP / CLIPの後ろからn層目の出力を使う') parser.add_argument("--max_embeddings_multiples", type=int, default=None, help='max embeding multiples, max token length is 75 * multiples / トークン長をデフォルトの何倍とするか 75*この値 がトークン長となる') @@ -2512,6 +2548,8 @@ help="1st stage steps for highres fix / highres fixの最初のステージのステップ数") parser.add_argument("--highres_fix_save_1st", action='store_true', help="save 1st stage images for highres fix / highres fixの最初のステージの画像を保存する") + parser.add_argument("--negative_scale", type=float, default=None, + help="set another guidance scale for negative prompt / ネガティブプロンプトのscaleを指定する") args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/library/common_gui.py b/library/common_gui.py index 76c3a13..167bc00 100644 --- a/library/common_gui.py +++ 
b/library/common_gui.py @@ -261,4 +261,31 @@ def set_pretrained_model_name_or_path_input(value, v2, v_parameterization): v2 = False v_parameterization = False - return value, v2, v_parameterization \ No newline at end of file + return value, v2, v_parameterization + + ### + ### Gradio common GUI section + ### + +def gradio_advanced_training(): + with gr.Row(): + max_train_epochs = gr.Textbox( + label='Max train epoch', + placeholder='(Optional) Override number of epoch', + ) + max_data_loader_n_workers = gr.Textbox( + label='Max num workers for DataLoader', + placeholder='(Optional) Override number of epoch. Default: 8', + ) + return max_train_epochs, max_data_loader_n_workers + +def run_cmd_advanced_training(**kwargs): + run_cmd = '' + max_train_epochs = kwargs.get('max_train_epochs', '') + max_data_loader_n_workers = kwargs.get('max_data_loader_n_workers', '') + if not max_train_epochs == '': + run_cmd += f' --max_train_epochs="{max_train_epochs}"' + if not max_data_loader_n_workers == '': + run_cmd += f' --max_data_loader_n_workers="{max_data_loader_n_workers}"' + + return run_cmd \ No newline at end of file diff --git a/library/model_util.py b/library/model_util.py index bc824a1..6a1e656 100644 --- a/library/model_util.py +++ b/library/model_util.py @@ -632,7 +632,7 @@ def convert_ldm_clip_checkpoint_v2(checkpoint, max_length): del new_sd[ANOTHER_POSITION_IDS_KEY] else: position_ids = torch.Tensor([list(range(max_length))]).to(torch.int64) - + new_sd["text_model.embeddings.position_ids"] = position_ids return new_sd @@ -886,7 +886,7 @@ def load_models_from_stable_diffusion_checkpoint(v2, ckpt_path, dtype=None): vae = AutoencoderKL(**vae_config) info = vae.load_state_dict(converted_vae_checkpoint) - print("loadint vae:", info) + print("loading vae:", info) # convert text_model if v2: @@ -1105,12 +1105,12 @@ def load_vae(vae_id, dtype): if vae_id.endswith(".bin"): # SD 1.5 VAE on Huggingface - vae_sd = torch.load(vae_id, map_location="cpu") - converted_vae_checkpoint = vae_sd + converted_vae_checkpoint = torch.load(vae_id, map_location="cpu") else: # StableDiffusion - vae_model = torch.load(vae_id, map_location="cpu") - vae_sd = vae_model['state_dict'] + vae_model = (load_file(vae_id, "cpu") if is_safetensors(vae_id) + else torch.load(vae_id, map_location="cpu")) + vae_sd = vae_model['state_dict'] if 'state_dict' in vae_model else vae_model # vae only or full model full_model = False @@ -1132,7 +1132,6 @@ def load_vae(vae_id, dtype): vae.load_state_dict(converted_vae_checkpoint) return vae - # endregion diff --git a/library/train_util.py b/library/train_util.py index bad954c..57ebf1b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -715,7 +715,10 @@ def debug_dataset(train_dataset): def glob_images(dir, base): img_paths = [] for ext in IMAGE_EXTENSIONS: - img_paths.extend(glob.glob(os.path.join(dir, base + ext))) + if base == '*': + img_paths.extend(glob.glob(os.path.join(glob.escape(dir), base + ext))) + else: + img_paths.extend(glob.glob(glob.escape(os.path.join(dir, base + ext)))) return img_paths # endregion @@ -744,6 +747,20 @@ def exists(val): def default(val, d): return val if exists(val) else d + +def model_hash(filename): + try: + with open(filename, "rb") as file: + import hashlib + m = hashlib.sha256() + + file.seek(0x100000) + m.update(file.read(0x10000)) + return m.hexdigest()[0:8] + except FileNotFoundError: + return 'NOFILE' + + # flash attention forwards and backwards # https://arxiv.org/abs/2205.14135 @@ -1030,6 +1047,8 @@ def 
add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: parser.add_argument("--learning_rate", type=float, default=2.0e-6, help="learning rate / 学習率") parser.add_argument("--max_train_steps", type=int, default=1600, help="training steps / 学習ステップ数") + parser.add_argument("--max_train_epochs", type=int, default=None, help="training epochs (overrides max_train_steps) / 学習エポック数(max_train_stepsを上書きします)") + parser.add_argument("--max_data_loader_n_workers", type=int, default=8, help="max num workers for DataLoader (lower is less main RAM usage, faster epoch start and slower data loading) / DataLoaderの最大プロセス数(小さい値ではメインメモリの使用量が減りエポック間の待ち時間が減りますが、データ読み込みは遅くなります)") parser.add_argument("--seed", type=int, default=None, help="random seed for training / 学習時の乱数のseed") parser.add_argument("--gradient_checkpointing", action="store_true", help="enable gradient checkpointing / grandient checkpointingを有効にする") diff --git a/lora_gui.py b/lora_gui.py index 4c0c51f..676f14d 100644 --- a/lora_gui.py +++ b/lora_gui.py @@ -19,7 +19,7 @@ from library.common_gui import ( get_saveasfile_path, color_aug_changed, save_inference_file, - set_pretrained_model_name_or_path_input, + set_pretrained_model_name_or_path_input, gradio_advanced_training,run_cmd_advanced_training, ) from library.dreambooth_folder_creation_gui import ( gradio_dreambooth_folder_creation_tab, @@ -81,6 +81,8 @@ def save_configuration( output_name, model_list, max_token_length, + max_train_epochs, + max_data_loader_n_workers, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -165,6 +167,8 @@ def open_configuration( output_name, model_list, max_token_length, + max_train_epochs, + max_data_loader_n_workers, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -233,6 +237,8 @@ def train_model( output_name, model_list, # Keep this. 
Yes, it is unused here but required given the common list used max_token_length, + max_train_epochs, + max_data_loader_n_workers, ): if pretrained_model_name_or_path == '': msgbox('Source model information is missing') @@ -410,6 +416,7 @@ def train_model( run_cmd += f' --output_name="{output_name}"' if (int(max_token_length) > 75): run_cmd += f' --max_token_length={max_token_length}' + run_cmd += run_cmd_advanced_training(max_train_epochs=max_train_epochs, max_data_loader_n_workers=max_data_loader_n_workers) print(run_cmd) # Run the command @@ -795,6 +802,7 @@ def lora_tab( ], value='75', ) + max_train_epochs, max_data_loader_n_workers = gradio_advanced_training() with gr.Tab('Tools'): gr.Markdown( @@ -854,7 +862,9 @@ def lora_tab( mem_eff_attn, output_name, model_list, - max_token_length + max_token_length, + max_train_epochs, + max_data_loader_n_workers, ] button_open_config.click( diff --git a/networks/extract_lora_from_models.py b/networks/extract_lora_from_models.py index c882e88..0a4c3a0 100644 --- a/networks/extract_lora_from_models.py +++ b/networks/extract_lora_from_models.py @@ -135,7 +135,7 @@ def svd(args): if dir_name and not os.path.exists(dir_name): os.makedirs(dir_name, exist_ok=True) - lora_network_o.save_weights(args.save_to, save_dtype) + lora_network_o.save_weights(args.save_to, save_dtype, {}) print(f"LoRA weights are saved to: {args.save_to}") diff --git a/networks/lora.py b/networks/lora.py index 730a637..3f8244e 100644 --- a/networks/lora.py +++ b/networks/lora.py @@ -92,7 +92,7 @@ class LoRANetwork(torch.nn.Module): def load_weights(self, file): if os.path.splitext(file)[1] == '.safetensors': - from safetensors.torch import load_file + from safetensors.torch import load_file, safe_open self.weights_sd = load_file(file) else: self.weights_sd = torch.load(file, map_location='cpu') @@ -174,7 +174,10 @@ class LoRANetwork(torch.nn.Module): def get_trainable_params(self): return self.parameters() - def save_weights(self, file, dtype): + def save_weights(self, file, dtype, metadata): + if metadata is not None and len(metadata) == 0: + metadata = None + state_dict = self.state_dict() if dtype is not None: @@ -185,6 +188,6 @@ class LoRANetwork(torch.nn.Module): if os.path.splitext(file)[1] == '.safetensors': from safetensors.torch import save_file - save_file(state_dict, file) + save_file(state_dict, file, metadata) else: torch.save(state_dict, file) diff --git a/tools/convert_diffusers20_original_sd.py b/tools/convert_diffusers20_original_sd.py index 3cff283..a3cd03f 100644 --- a/tools/convert_diffusers20_original_sd.py +++ b/tools/convert_diffusers20_original_sd.py @@ -90,4 +90,4 @@ if __name__ == '__main__': help="model to save: checkpoint (with extension) or Diffusers model's directory (without extension) / 変換後のモデル、拡張子がある場合はcheckpoint、ない場合はDiffusesモデルとして保存") args = parser.parse_args() - convert(args) \ No newline at end of file + convert(args) diff --git a/tools/detect_face_rotate.py b/tools/detect_face_rotate.py index ef6d188..4d5e58d 100644 --- a/tools/detect_face_rotate.py +++ b/tools/detect_face_rotate.py @@ -236,4 +236,4 @@ if __name__ == '__main__': parser.add_argument("--debug", action="store_true", help="render rect for face / 処理後画像の顔位置に矩形を描画します") args = parser.parse_args() - process(args) \ No newline at end of file + process(args) diff --git a/train_db.py b/train_db.py index 8c9cdb9..bbef3da 100644 --- a/train_db.py +++ b/train_db.py @@ -134,10 +134,15 @@ def train(args): # dataloaderを準備する # DataLoaderのプロセス数:0はメインプロセスになる - n_workers = min(8, os.cpu_count() - 
1) # cpu_count-1 ただし最大8 + n_workers = min(args.max_data_loader_n_workers, os.cpu_count() - 1) # cpu_count-1 ただし最大で指定された数まで train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn, num_workers=n_workers) + # 学習ステップ数を計算する + if args.max_train_epochs is not None: + args.max_train_steps = args.max_train_epochs * len(train_dataloader) + print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + # lr schedulerを用意する lr_scheduler = diffusers.optimization.get_scheduler( args.lr_scheduler, optimizer, num_warmup_steps=args.lr_warmup_steps, num_training_steps=args.max_train_steps) diff --git a/train_network.py b/train_network.py index 9f292b9..c0a881a 100644 --- a/train_network.py +++ b/train_network.py @@ -126,10 +126,15 @@ def train(args): # dataloaderを準備する # DataLoaderのプロセス数:0はメインプロセスになる - n_workers = min(8, os.cpu_count() - 1) # cpu_count-1 ただし最大8 + n_workers = min(args.max_data_loader_n_workers, os.cpu_count() - 1) # cpu_count-1 ただし最大で指定された数まで train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn, num_workers=n_workers) + # 学習ステップ数を計算する + if args.max_train_epochs is not None: + args.max_train_steps = args.max_train_epochs * len(train_dataloader) + print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + # lr schedulerを用意する lr_scheduler = diffusers.optimization.get_scheduler( args.lr_scheduler, optimizer, num_warmup_steps=args.lr_warmup_steps, num_training_steps=args.max_train_steps * args.gradient_accumulation_steps) @@ -194,9 +199,62 @@ def train(args): print(f" num epochs / epoch数: {num_train_epochs}") print(f" batch size per device / バッチサイズ: {args.train_batch_size}") print(f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}") - print(f" gradient ccumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}") + print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}") print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}") + metadata = { + "ss_learning_rate": args.learning_rate, + "ss_text_encoder_lr": args.text_encoder_lr, + "ss_unet_lr": args.unet_lr, + "ss_num_train_images": train_dataset.num_train_images, # includes repeating TODO more detailed data + "ss_num_reg_images": train_dataset.num_reg_images, + "ss_num_batches_per_epoch": len(train_dataloader), + "ss_num_epochs": num_train_epochs, + "ss_batch_size_per_device": args.train_batch_size, + "ss_total_batch_size": total_batch_size, + "ss_gradient_accumulation_steps": args.gradient_accumulation_steps, + "ss_max_train_steps": args.max_train_steps, + "ss_lr_warmup_steps": args.lr_warmup_steps, + "ss_lr_scheduler": args.lr_scheduler, + "ss_network_module": args.network_module, + "ss_network_dim": args.network_dim, # None means default because another network than LoRA may have another default dim + "ss_mixed_precision": args.mixed_precision, + "ss_full_fp16": bool(args.full_fp16), + "ss_v2": bool(args.v2), + "ss_resolution": args.resolution, + "ss_clip_skip": args.clip_skip, + "ss_max_token_length": args.max_token_length, + "ss_color_aug": bool(args.color_aug), + "ss_flip_aug": bool(args.flip_aug), + "ss_random_crop": bool(args.random_crop), + "ss_shuffle_caption": bool(args.shuffle_caption), + "ss_cache_latents": bool(args.cache_latents), + "ss_enable_bucket": bool(train_dataset.enable_bucket), # TODO 
move to BaseDataset from DB/FT + "ss_min_bucket_reso": args.min_bucket_reso, # TODO get from dataset + "ss_max_bucket_reso": args.max_bucket_reso, + "ss_seed": args.seed + } + + # uncomment if another network is added + # for key, value in net_kwargs.items(): + # metadata["ss_arg_" + key] = value + + if args.pretrained_model_name_or_path is not None: + sd_model_name = args.pretrained_model_name_or_path + if os.path.exists(sd_model_name): + metadata["ss_sd_model_hash"] = train_util.model_hash(sd_model_name) + sd_model_name = os.path.basename(sd_model_name) + metadata["ss_sd_model_name"] = sd_model_name + + if args.vae is not None: + vae_name = args.vae + if os.path.exists(vae_name): + metadata["ss_vae_hash"] = train_util.model_hash(vae_name) + vae_name = os.path.basename(vae_name) + metadata["ss_vae_name"] = vae_name + + metadata = {k: str(v) for k, v in metadata.items()} + progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") global_step = 0 @@ -208,6 +266,7 @@ def train(args): for epoch in range(num_train_epochs): print(f"epoch {epoch+1}/{num_train_epochs}") + metadata["ss_epoch"] = str(epoch+1) network.on_epoch_start(text_encoder, unet) @@ -296,7 +355,7 @@ def train(args): ckpt_name = train_util.EPOCH_FILE_NAME.format(model_name, epoch + 1) + '.' + args.save_model_as ckpt_file = os.path.join(args.output_dir, ckpt_name) print(f"saving checkpoint: {ckpt_file}") - unwrap_model(network).save_weights(ckpt_file, save_dtype) + unwrap_model(network).save_weights(ckpt_file, save_dtype, None if args.no_metadata else metadata) def remove_old_func(old_epoch_no): old_ckpt_name = train_util.EPOCH_FILE_NAME.format(model_name, old_epoch_no) + '.' + args.save_model_as @@ -311,6 +370,8 @@ def train(args): # end of epoch + metadata["ss_epoch"] = str(num_train_epochs) + is_main_process = accelerator.is_main_process if is_main_process: network = unwrap_model(network) @@ -330,7 +391,7 @@ def train(args): ckpt_file = os.path.join(args.output_dir, ckpt_name) print(f"save trained model to {ckpt_file}") - network.save_weights(ckpt_file, save_dtype) + network.save_weights(ckpt_file, save_dtype, None if args.no_metadata else metadata) print("model saved.") @@ -341,6 +402,7 @@ if __name__ == '__main__': train_util.add_dataset_arguments(parser, True, True) train_util.add_training_arguments(parser, True) + parser.add_argument("--no_metadata", action='store_true', help="do not save metadata in output model / メタデータを出力先モデルに保存しない") parser.add_argument("--save_model_as", type=str, default="pt", choices=[None, "ckpt", "pt", "safetensors"], help="format to save the model (default is .pt) / モデル保存時の形式(デフォルトはpt)")
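
The two new GUI fields are forwarded to the trainers as extra command-line flags. A minimal sketch of how run_cmd_advanced_training composes them; the base command shown here is illustrative, not taken from the patch:

def run_cmd_advanced_training(**kwargs):
    run_cmd = ''
    max_train_epochs = kwargs.get('max_train_epochs', '')
    max_data_loader_n_workers = kwargs.get('max_data_loader_n_workers', '')
    if max_train_epochs != '':
        run_cmd += f' --max_train_epochs="{max_train_epochs}"'
    if max_data_loader_n_workers != '':
        run_cmd += f' --max_data_loader_n_workers="{max_data_loader_n_workers}"'
    return run_cmd

run_cmd = 'accelerate launch train_network.py'   # illustrative base command
run_cmd += run_cmd_advanced_training(max_train_epochs='10',
                                     max_data_loader_n_workers='4')
print(run_cmd)
# accelerate launch train_network.py --max_train_epochs="10" --max_data_loader_n_workers="4"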
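
With --negative_scale (or "nl ..." in an interactive prompt), the pipeline runs three latent copies per step: negative prompt, positive prompt, and a real unconditional, then combines the U-Net outputs as below. This is a toy-tensor sketch of the combination step only; the tensor shapes are illustrative:

import torch

guidance_scale, negative_scale = 7.5, 5.0
# stand-in for the U-Net output on the three concatenated latent copies
noise_pred = torch.randn(3, 4, 64, 64)
noise_pred_negative, noise_pred_text, noise_pred_uncond = noise_pred.chunk(3)

noise_pred = (noise_pred_uncond
              + guidance_scale * (noise_pred_text - noise_pred_uncond)
              - negative_scale * (noise_pred_negative - noise_pred_uncond))
print(noise_pred.shape)  # torch.Size([1, 4, 64, 64])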
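
gen_img_diffusers.py now accepts lists for --network_module / --network_weights / --network_mul / --network_dim, plus a per-network --network_args string of key=value pairs separated by ";". A sketch of that parsing; the example keys and values are hypothetical:

def parse_network_args(network_args: str) -> dict:
    net_kwargs = {}
    for net_arg in network_args.split(";"):
        key, value = net_arg.split("=")
        net_kwargs[key] = value
    return net_kwargs

print(parse_network_args("key1=value1;key2=value2"))
# {'key1': 'value1', 'key2': 'value2'}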
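
glob_images now escapes glob metacharacters so folders such as "10_concept [v2]" are still found; when the base is "*" only the directory is escaped so the wildcard keeps expanding. A standalone sketch, assuming a typical extension list (the real IMAGE_EXTENSIONS constant lives in library/train_util.py):

import glob
import os

IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".webp", ".bmp"]   # assumed list

def glob_images(dir, base):
    img_paths = []
    for ext in IMAGE_EXTENSIONS:
        if base == '*':
            # wildcard base: escape only the directory so '*' still expands
            img_paths.extend(glob.glob(os.path.join(glob.escape(dir), base + ext)))
        else:
            # explicit base name: the whole joined path can be escaped
            img_paths.extend(glob.glob(glob.escape(os.path.join(dir, base + ext))))
    return img_paths

print(glob_images("train_data/10_concept [v2]", "*"))   # hypothetical folder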
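
The new train_util.model_hash helper produces the short hash recorded as ss_sd_model_hash / ss_vae_hash: a SHA-256 over 64 KiB starting at offset 0x100000, truncated to 8 hex digits. An equivalent standalone sketch; the checkpoint path is hypothetical:

import hashlib

def model_hash(filename):
    # hash 0x10000 bytes starting at offset 0x100000, keep the first 8 hex digits
    try:
        with open(filename, "rb") as file:
            m = hashlib.sha256()
            file.seek(0x100000)
            m.update(file.read(0x10000))
            return m.hexdigest()[0:8]
    except FileNotFoundError:
        return 'NOFILE'

print(model_hash("v1-5-pruned.safetensors"))   # hypothetical checkpoint path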
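
In train_db.py and train_network.py the two flags change how a run is sized: the DataLoader worker count is now capped by --max_data_loader_n_workers instead of a hard-coded 8, and --max_train_epochs, when given, is converted to steps via the dataloader length. A self-contained sketch of that arithmetic:

import os

def resolve_n_workers(max_data_loader_n_workers: int) -> int:
    # previously min(8, cpu_count - 1); the cap now comes from the CLI flag (default 8)
    return min(max_data_loader_n_workers, os.cpu_count() - 1)

def resolve_max_train_steps(max_train_steps, max_train_epochs, steps_per_epoch):
    # --max_train_epochs overrides --max_train_steps when it is set
    if max_train_epochs is not None:
        max_train_steps = max_train_epochs * steps_per_epoch
    return max_train_steps

print(resolve_n_workers(4))
print(resolve_max_train_steps(1600, max_train_epochs=2, steps_per_epoch=1500))  # 3000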
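
Because LoRANetwork.save_weights now passes the ss_* metadata dict through to safetensors save_file, the recorded training parameters can be read back from the file header. A sketch using the same safe_open call the patch uses elsewhere; the output file name is hypothetical:

from safetensors.torch import safe_open

with safe_open("last.safetensors", framework="pt") as f:   # hypothetical LoRA output file
    metadata = f.metadata() or {}

for key in ("ss_learning_rate", "ss_network_dim", "ss_sd_model_hash", "ss_epoch"):
    print(key, metadata.get(key))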