Spaces:

lint
/

anime_controlnet

Runtime error

App Files Files Community

1lint commited on May 17, 2023

Commit

6230dda

0 Parent(s):

init commit

Browse files

Files changed (22) hide show

.gitignore +173 -0
LICENSE +201 -0
README.md +76 -0
app.py +4 -0
configs/controlnet_config.json +41 -0
convert_state_dict.sh +8 -0
main.py +49 -0
quickstart_train.py +50 -0
requirements.txt +18 -0
src/__init__.py +2 -0
src/app.py +260 -0
src/controlnet_pipe.py +309 -0
src/convert_sd.py +223 -0
src/data.py +149 -0
src/lab.py +474 -0
src/ui_assets/controlnet_ids.txt +4 -0
src/ui_assets/examples +1 -0
src/ui_assets/footer.html +9 -0
src/ui_assets/header.html +23 -0
src/ui_assets/model_ids.txt +5 -0
src/ui_functions.py +285 -0
src/ui_shared.py +24 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,173 @@

+# added
+archive/
+wandb/
+logs
+models
+.git_*
+test*
+video/
+train.py
+deploy.py
+examples/
+notebooks/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2023 1lint
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,76 @@

+---
+title: Style ControlNet
+emoji: ❅
+colorFrom: gray
+colorTo: green
+sdk: gradio
+sdk_version: 3.30.0
+app_file: app.py
+pinned: True
+license: openrail
+---
+# ControlStyle
+Proof of concept for controlling Stable Diffusion image style using a ControlNet.
+| ![](./examples/blue_eyes.gif)  | ![](./examples/blue_eyes.png) |
+| ------------- | ------------- |
+`prompt`: "beautiful woman with blue eyes", `controlnet_prompt`: "1girl, blue eyes"
+| ![](./examples/mountains.gif)  | ![](./examples/mountains.png) |
+| ------------- | ------------- |
+`prompt` and `controlnet_prompt`: "best quality, masterpiece, Dark hair, dark eyes, upper body, sun flare, outdoors, mountain, valley, sky. clouds, smiling"
+`controlnet_conditioning_scale` increments by 0.1 from 0 to 1, left to right.
+## Try Style Controlnet with A1111 WebUI
+![](./examples/zerohint_grid.png)
+![](./examples/hint_grid.png)
+### Quick start: download the anime controlnets [here](https://huggingface.co/lint/anime_control/tree/main).
+The root folder has controlnets in Diffusers format; the A1111_weights folder has controlnets for use with the [A1111 Webui Controlnet Extension](https://github.com/Mikubill/sd-webui-controlnet). More details at the [HF repo page](https://huggingface.co/lint/anime_control).
+## Quick Start Training
+For a basic training example with HF Accelerate, run the following:
+```
+pip install -r requirements.txt
+python quickstart_train.py
+```
+By default, the script will download pipeline weights and an image dataset from HF Hub.
+The base Stable Diffusion checkpoint and controlnet weights can be either in HF diffusers format or in the original Stable Diffusion pytorch-lightning format (inferred from whether the destination is a file or not).
+Use `convert_state_dict.sh` to convert the trained controlnet state dict from `diffusers` format to one compatible with the [A1111 controlnet extension](https://github.com/Mikubill/sd-webui-controlnet).
+## Style Controlnet Web UI
+Launch the Web UI locally with
+```
+python app.py
+```
+(My HF Spaces below are currently out of date; I will fix them once I have time.)
+Try the WebUI hosted on HF Spaces at https://huggingface.co/spaces/lint/anime_controlnet
+![](./examples/controlstyle_ui.png)
+WebUI also supports basic training
+![](./examples/training_ui.png)
+## ControlNet for Style
+Lvmin introduced [ControlNet](https://github.com/lllyasviel/ControlNet), which uses a cloned Stable Diffusion UNet to inject external conditioning, such as body poses or sketch lines, to guide Stable Diffusion generation with fantastic results.
+I thought his approach might also work for introducing different styles (e.g. adding an anime style) to guide the image generation process. Unlike the original controlnets, I initialized the controlnet weights from a distinct UNet (`andite/anything-v4.5`), and predominantly trained without any controlnet conditioning image on a synthetic anime dataset (`lint/anybooru`) distinct from the base model. Then the main controlnet weights were frozen, and the input hint block weights were added back in and trained on the same dataset, using canny image processing to generate the controlnet conditioning image.
+I originally trained the anime style controlnets without any controlnet conditioning image, so that the controlnet would focus on adding anime style rather than structure to the image. I have these weights saved at https://huggingface.co/lint/anime_styler/tree/main/A1111_webui_weights; however, they need to be used with my [fork](https://github.com/1lint/sd-webui-controlnet) of the controlnet extension, which has very minor changes that allow the user to load the controlnet without the input hint block weights and to pass None as a valid controlnet "conditioning".
+Recently I added the input hint processing module back in, and trained only the controlnet input hint blocks on canny image generation. So the models in this repository are now just like regular controlnets, except for having a different initialization and training process. They can be used just like a regular controlnet, but the vast majority of the weights were trained on adding anime style, with just the input hint blocks trained on using the controlnet conditioning image. Though it seems to work alright from my limited testing so far, expect the canny image guidance to be weak, so combine it with the original canny image controlnet as needed.
+Since the main controlnet weights were trained without any canny image conditioning, they can be (and were intended to be) used without any controlnet conditioning image. However, the existing A1111 Controlnet Extension expects the user to always pass a controlnet conditioning image; otherwise it will trigger an error. As a workaround, you can pass a black square as the "conditioning image". This adds some unexpected random noise to the image due to the input hint block `bias` weights, but the noise is small enough that the controlnet still appears to "work".

app.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from src import demo
+from multiprocessing import cpu_count
+demo.queue(concurrency_count=cpu_count()).launch()  # enable request queuing with concurrency_count set to the CPU core count, then launch the app

configs/controlnet_config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_class_name": "ControlNetModel",
+  "_diffusers_version": "0.14.0.dev0",
+  "act_fn": "silu",
+  "attention_head_dim": 8,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "class_embed_type": null,
+  "conditioning_embedding_out_channels": [
+    16,
+    32,
+    96,
+    256
+  ],
+  "controlnet_conditioning_channel_order": "rgb",
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_scale_factor": 1,
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "upcast_attention": false,
+  "use_linear_projection": false
+}

convert_state_dict.sh ADDED Viewed

	@@ -0,0 +1,8 @@

+# Converts a controlnet state dict saved in diffusers format to the original Stable Diffusion controlnet format that can be used with the A1111 controlnet extension.
+export INPUT_PATH="/home/user/style_controlnet/models/deliberate_v2_animestyler/checkpoint-332228/diffusion_pytorch_model.safetensors"  # machine-specific absolute path; edit to point at your own diffusers-format weights
+export OUTPUT_PATH="models/A1111_weights/anime_styler-deliberate-v0.1.safetensors"  # passed as --checkpoint_path; presumably where the converted weights are written
+python src/convert_sd.py --model_path="$INPUT_PATH" --checkpoint_path="$OUTPUT_PATH"  --is_controlnet --half --to_safetensors

main.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from argparse import Namespace
+from multiprocessing import cpu_count
+from src.lab import Lab
+args = Namespace(  # training configuration handed to Lab in the entry-point guard at the bottom
+    pretrained_model_name_or_path="lint/liquidfix",
+    controlnet_weights_path="lint/anime_control/anime_merge",
+    # alternative: controlnet_weights_path=None,
+    vae_path="lint/anime_vae",
+    # dataset args
+    train_data_dir="/mnt/g/data/anybooru/train",  # machine-specific local path; change for your setup
+    valid_data_dir="/mnt/g/data/anybooru/valid",  # machine-specific local path; change for your setup
+    resolution=512,
+    from_hf_hub=False,  # presumably: the data dirs above are local folders rather than HF Hub repos (confirm in src/data.py)
+    controlnet_hint_key="canny", # "canny" trains with a canny hint image; None presumably trains without a hint (confirm in src/lab.py)
+    # training args
+    # options are ["zero convolutions", "input hint blocks"], otherwise trains whole controlnet
+    training_stage = "",
+    learning_rate=5e-6,
+    num_train_epochs=1000,
+    max_train_steps=None,
+    seed=3434554,
+    max_grad_norm=1.0,
+    gradient_accumulation_steps=1,
+    # VRAM args
+    batch_size=1,
+    mixed_precision="fp16", # set to "fp16" for mixed-precision training.
+    gradient_checkpointing=True, # set this to True to lower the memory usage.
+    use_8bit_adam=True, # use 8bit optimizer from bitsandbytes
+    enable_xformers_memory_efficient_attention=True,
+    allow_tf32=True,
+    dataloader_num_workers=cpu_count(),  # one dataloader worker per CPU core
+    # logging args
+    output_dir="./models",
+    report_to='tensorboard',
+    image_logging_steps=600, # disabled when 0. costs additional VRAM to log images
+    save_whole_pipeline=True,
+    checkpointing_steps=6000,
+)
+if __name__ == '__main__':  # only start training when run as a script, not on import
+    lab = Lab(args)
+    lab.train(args.num_train_epochs)

quickstart_train.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from argparse import Namespace
+from multiprocessing import cpu_count
+from src.lab import Lab
+# runs on 10GB VRAM GPU (RTX 3080)
+args = Namespace(  # training configuration handed to Lab in the entry-point guard at the bottom
+    pretrained_model_name_or_path="lint/liquidfix",
+    controlnet_weights_path="lint/anime_control/anime_merge",
+    # alternative: controlnet_weights_path=None,
+    vae_path="lint/anime_vae",
+    # dataset args
+    train_data_dir="lint/anybooru",  # used together with from_hf_hub=True below; presumably an HF Hub dataset repo id
+    valid_data_dir="",
+    resolution=512,
+    from_hf_hub=True,
+    controlnet_hint_key="canny", # "canny" trains with a canny hint image; None presumably trains without a hint (confirm in src/lab.py)
+    # training args
+    # options are ["zero convolutions", "input hint blocks"], otherwise trains whole controlnet
+    training_stage = "",
+    learning_rate=5e-6,
+    num_train_epochs=1000,
+    max_train_steps=None,
+    seed=3434554,
+    max_grad_norm=1.0,
+    gradient_accumulation_steps=1,
+    # VRAM args
+    batch_size=1,
+    mixed_precision="fp16", # set to "fp16" for mixed-precision training.
+    gradient_checkpointing=True, # set this to True to lower the memory usage.
+    use_8bit_adam=True, # use 8bit optimizer from bitsandbytes
+    enable_xformers_memory_efficient_attention=True,
+    allow_tf32=True,
+    dataloader_num_workers=cpu_count(),  # one dataloader worker per CPU core
+    # logging args
+    output_dir="./models",
+    report_to='tensorboard',
+    image_logging_steps=600, # disabled when 0. costs additional VRAM to log images
+    save_whole_pipeline=True,
+    checkpointing_steps=6000,
+)
+if __name__ == '__main__':  # only start training when run as a script, not on import
+    lab = Lab(args)
+    lab.train(args.num_train_epochs)

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+accelerate==0.18.0
+datasets>=2.10.0
+diffusers==0.16.1
+gradio>=3.28.3
+huggingface_hub>=0.14.1
+numpy
+packaging
+Pillow
+torch
+torchvision
+tqdm
+transformers>=4.25.1
+omegaconf>=2.2.3
+opencv_contrib_python==4.6.0.66
+safetensors>=0.2.6
+xformers==0.0.17.dev466
+bitsandbytes
+tensorboard>=2.12.0

src/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .app import demo
2	+ from .lab import Lab

src/app.py ADDED Viewed

	@@ -0,0 +1,260 @@

+import gradio as gr
+from multiprocessing import cpu_count
+from src.ui_shared import (
+    model_ids,
+    scheduler_names,
+    default_scheduler,
+    controlnet_ids,
+    assets_directory,
+)
+from src.ui_functions import generate, run_training
+default_img_size = 512
+with open(f"{assets_directory}/header.html") as fp:
+    header = fp.read()
+with open(f"{assets_directory}/footer.html") as fp:
+    footer = fp.read()
+theme = gr.themes.Soft(
+    primary_hue="blue",
+    neutral_hue="slate",
+)
+from gradio.themes.builder_app import css
+with gr.Blocks(theme=theme) as demo:
+    gr.HTML(header)
+    with gr.Row():
+        with gr.Column(scale=70):
+            prompt = gr.Textbox(
+                label="Prompt", placeholder="Press <Shift+Enter> to generate", lines=2
+            )
+            neg_prompt = gr.Textbox(label="Negative Prompt", placeholder="", lines=2)
+            with gr.Row():
+                controlnet_prompt = gr.Textbox(
+                    label="Controlnet Prompt",
+                    placeholder="If empty, defaults to base `Prompt`",
+                    lines=2,
+                )
+                controlnet_negative_prompt = gr.Textbox(
+                    label="Controlnet Negative Prompt",
+                    placeholder="If empty, defaults to base `Negative Prompt`",
+                    lines=2,
+                )
+        with gr.Column(scale=30):
+            model_name = gr.Dropdown(
+                label="Model", choices=model_ids, value=model_ids[0]
+            )
+            controlnet_name = gr.Dropdown(
+                label="Controlnet", choices=controlnet_ids, value=controlnet_ids[0]
+            )
+            scheduler_name = gr.Dropdown(
+                label="Scheduler", choices=scheduler_names, value=default_scheduler
+            )
+            generate_button = gr.Button(value="Generate", variant="primary")
+    with gr.Row():
+        with gr.Column():
+            with gr.Tab("Inference") as tab:
+                guidance_image = gr.Image(
+                    label="Guidance Image",
+                    source="upload",
+                    tool="editor",
+                    type="pil",
+                ).style(height=256)
+                with gr.Row():
+                    controlnet_cond_scale = gr.Slider(
+                        label="Controlnet Weight",
+                        value=0.5,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                with gr.Row():
+                    batch_size = gr.Slider(
+                        label="Batch Size", value=1, minimum=1, maximum=8, step=1
+                    )
+                    seed = gr.Slider(-1, 2147483647, label="Seed", value=-1, step=1)
+                with gr.Row():
+                    guidance = gr.Slider(
+                        label="Guidance scale", value=7.5, minimum=0, maximum=20
+                    )
+                    steps = gr.Slider(
+                        label="Steps", value=20, minimum=1, maximum=100, step=1
+                    )
+                with gr.Row():
+                    width = gr.Slider(
+                        label="Width",
+                        value=default_img_size,
+                        minimum=64,
+                        maximum=1024,
+                        step=32,
+                    )
+                    height = gr.Slider(
+                        label="Height",
+                        value=default_img_size,
+                        minimum=64,
+                        maximum=1024,
+                        step=32,
+                    )
+            with gr.Tab("Train Style ControlNet") as tab:
+                with gr.Row():
+                    train_batch_size = gr.Slider(
+                        label="Training Batch Size",
+                        minimum=1,
+                        maximum=8,
+                        step=1,
+                        value=1,
+                    )
+                    gradient_accumulation_steps = gr.Slider(
+                        label="Gradient Accumulation steps",
+                        minimum=1,
+                        maximum=6,
+                        step=1,
+                        value=4,
+                    )
+                with gr.Row():
+                    max_train_steps = gr.Number(
+                        label="Total training steps", value=16000
+                    )
+                    train_learning_rate = gr.Number(label="Learning Rate", value=5.0e-6)
+                with gr.Row():
+                    checkpointing_steps = gr.Number(
+                        label="Steps between saving checkpoints", value=4000
+                    )
+                    image_logging_steps = gr.Number(
+                        label="Steps between logging example images (pass 0 to disable)",
+                        value=0,
+                    )
+                with gr.Row():
+                    train_data_dir = gr.Textbox(
+                        label=f"Path to training image folder",
+                        value="lint/anybooru",
+                    )
+                    valid_data_dir = gr.Textbox(
+                        label=f"Path to validation image folder",
+                        value="",
+                    )
+                with gr.Row():
+                    controlnet_weights_path = gr.Textbox(
+                        label=f"Repo for initializing Controlnet Weights",
+                        value="andite/anything-v4.0/unet",
+                    )
+                    output_dir = gr.Textbox(
+                        label=f"Output directory for trained weights", value="./models"
+                    )
+                with gr.Row():
+                    train_whole_controlnet = gr.Checkbox(
+                        label="Train whole controlnet", value=True
+                    )
+                    save_whole_pipeline = gr.Checkbox(
+                        label="Save whole pipeline", value=True
+                    )
+                training_button = gr.Button(
+                    value="Train Style ControlNet", variant="primary"
+                )
+                training_status = gr.Text(label="Training Status")
+        with gr.Column():
+            gallery = gr.Gallery(
+                label="Generated images", show_label=False, elem_id="gallery"
+            ).style(height=default_img_size, grid=2)
+            generation_details = gr.Markdown()
+            # pipe_kwargs = gr.Textbox(label="Pipe kwargs", value="{\n\t\n}", visible=False)
+            # if torch.cuda.is_available():
+            #  giga = 2**30
+            #  vram_guage = gr.Slider(0, torch.cuda.memory_reserved(0)/giga, label='VRAM Allocated to Reserved (GB)', value=0, step=1)
+            #  demo.load(lambda : torch.cuda.memory_allocated(0)/giga, inputs=[], outputs=vram_guage, every=0.5, show_progress=False)
+    # gr.HTML(footer)
+    inputs = [
+        model_name,
+        guidance_image,
+        controlnet_name,
+        scheduler_name,
+        prompt,
+        guidance,
+        steps,
+        batch_size,
+        width,
+        height,
+        seed,
+        neg_prompt,
+        controlnet_prompt,
+        controlnet_negative_prompt,
+        controlnet_cond_scale,
+        # pipe_kwargs,
+    ]
+    outputs = [gallery, generation_details]
+    prompt.submit(generate, inputs=inputs, outputs=outputs)
+    generate_button.click(generate, inputs=inputs, outputs=outputs)
+    training_inputs = [
+        model_name,
+        controlnet_weights_path,
+        train_data_dir,
+        valid_data_dir,
+        train_batch_size,
+        train_whole_controlnet,
+        gradient_accumulation_steps,
+        max_train_steps,
+        train_learning_rate,
+        output_dir,
+        checkpointing_steps,
+        image_logging_steps,
+        save_whole_pipeline,
+    ]
+    training_button.click(
+        run_training,
+        inputs=training_inputs,
+        outputs=[training_status],
+    )
+    # from gradio.themes.builder_app
+    demo.load(
+        None,
+        None,
+        None,
+        _js="""() => {
+        if (document.querySelectorAll('.dark').length) {
+            document.querySelectorAll('.dark').forEach(el => el.classList.remove('dark'));
+        } else {
+            document.querySelector('body').classList.add('dark');
+        }
+    }""",
+    )
+# Script entry point: enable the Gradio request queue with a concurrency equal to
+# the value returned by cpu_count(), then launch the demo.
+if __name__ == "__main__":
+    demo.queue(concurrency_count=cpu_count()).launch()

src/controlnet_pipe.py ADDED Viewed

	@@ -0,0 +1,309 @@

+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import *
+class ControlNetPipe(StableDiffusionControlNetPipeline):
+    # copied from superclass and modified to accept controlnet prompt independent of base prompt
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        guess_mode: bool = False,
+        controlnet_prompt_embeds = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`,
+                    `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`):
+                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
+                also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
+                height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
+                specified in init, images must be passed as a list such that each element of the list can be correctly
+                batched for input to a single controlnet.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+                corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
+                you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
+        Examples:
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height, width = self._default_height_width(height, width, image)
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            image,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            controlnet_conditioning_scale,
+        )
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+        # 4. Prepare image
+        if isinstance(self.controlnet, ControlNetModel):
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=self.controlnet.dtype,
+                do_classifier_free_guidance=do_classifier_free_guidance,
+                guess_mode=guess_mode,
+            )
+        elif isinstance(self.controlnet, MultiControlNetModel):
+            images = []
+            for image_ in image:
+                image_ = self.prepare_image(
+                    image=image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=self.controlnet.dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+                images.append(image_)
+            image = images
+        else:
+            assert False
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        if not controlnet_prompt_embeds:
+            controlnet_prompt_embeds = prompt_embeds
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # controlnet(s) inference
+                if guess_mode and do_classifier_free_guidance:
+                    # Infer ControlNet only for the conditional batch.
+                    controlnet_latent_model_input = latents
+                    controlnet_prompt_embeds = controlnet_prompt_embeds.chunk(2)[1]
+                else:
+                    controlnet_latent_model_input = latent_model_input
+                    controlnet_prompt_embeds = controlnet_prompt_embeds
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    controlnet_latent_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond=image,
+                    conditioning_scale=controlnet_conditioning_scale,
+                    guess_mode=guess_mode,
+                    return_dict=False,
+                )
+                if guess_mode and do_classifier_free_guidance:
+                    # Infered ControlNet only for the conditional batch.
+                    # To apply the output of ControlNet to both the unconditional and conditional batches,
+                    # add 0 to the unconditional batch to keep it unchanged.
+                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
+                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                ).sample
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+        elif output_type == "pil":
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+        else:
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+        if not return_dict:
+            return (image, has_nsfw_concept)
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

src/convert_sd.py ADDED Viewed

	@@ -0,0 +1,223 @@

+# Script for converting a HF Diffusers saved pipeline to a Stable Diffusion checkpoint.
+# *Only* converts the UNet, VAE, and Text Encoder.
+# Does not convert optimizer state or any other thing.
+# Originally written by jachiam at https://gist.github.com/jachiam/8a5c0b607e38fcc585168b90c686eb05
+# modified by 1lint to support controlnet conversion
+import argparse
+import torch
+from safetensors import safe_open
+from safetensors.torch import save_file
+from pathlib import Path
+# =================#
+# UNet Conversion #
+# =================#
+# Exact key renames for the layers outside the repeated down/mid/up blocks.
+# NOTE: convert_unet_state_dict slices the first six entries for ControlNets,
+# so the order of this list matters.
+unet_conversion_map = [
+    # (stable-diffusion, HF Diffusers)
+    ("time_embed.0.weight", "time_embedding.linear_1.weight"),
+    ("time_embed.0.bias", "time_embedding.linear_1.bias"),
+    ("time_embed.2.weight", "time_embedding.linear_2.weight"),
+    ("time_embed.2.bias", "time_embedding.linear_2.bias"),
+    ("input_blocks.0.0.weight", "conv_in.weight"),
+    ("input_blocks.0.0.bias", "conv_in.bias"),
+    ("out.0.weight", "conv_norm_out.weight"),
+    ("out.0.bias", "conv_norm_out.bias"),
+    ("out.2.weight", "conv_out.weight"),
+    ("out.2.bias", "conv_out.bias"),
+]
+# Substring renames applied inside resnet blocks (only to keys containing "resnets").
+unet_conversion_map_resnet = [
+    # (stable-diffusion, HF Diffusers)
+    ("in_layers.0", "norm1"),
+    ("in_layers.2", "conv1"),
+    ("out_layers.0", "norm2"),
+    ("out_layers.3", "conv2"),
+    ("emb_layers.1", "time_emb_proj"),
+    ("skip_connection", "conv_shortcut"),
+]
+# Prefix renames (stable-diffusion prefix, HF Diffusers prefix) for every
+# resnet / attention / resampling layer; filled in by the loops below.
+unet_conversion_map_layer = []
+# hardcoded number of downblocks and resnets/attentions...
+# would need smarter logic for other networks.
+for i in range(4):
+    # loop over downblocks/upblocks
+    for j in range(2):
+        # loop over resnets/attentions for downblocks
+        # SD numbers input blocks flat: block i, resnet j -> input_blocks.{3*i + j + 1}
+        hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
+        sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
+        unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
+        if i < 3:
+            # no attention layers in down_blocks.3
+            hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
+            sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
+            unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
+    for j in range(3):
+        # loop over resnets/attentions for upblocks
+        hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
+        sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
+        unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
+        if i > 0:
+            # no attention layers in up_blocks.0
+            hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
+            sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
+            unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
+    if i < 3:
+        # no downsample in down_blocks.3
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
+        sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
+        unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
+        # no upsample in up_blocks.3
+        # the upsampler sits at sub-index 1 in up block 0 (which has no attention) and 2 otherwise
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
+        unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
+# Middle block: one attention layer (SD index 1) between two resnets (SD indices 0 and 2).
+hf_mid_atn_prefix = "mid_block.attentions.0."
+sd_mid_atn_prefix = "middle_block.1."
+unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
+for j in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{j}."
+    sd_mid_res_prefix = f"middle_block.{2*j}."
+    unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
+def convert_unet_state_dict(unet_state_dict, is_controlnet=True):
+    """Rename the keys of a HF Diffusers UNet/ControlNet state dict to Stable Diffusion names.
+    Args:
+        unet_state_dict: mapping from HF Diffusers parameter names to their values.
+        is_controlnet: when True, only the first six entries of `unet_conversion_map` are
+            used and the ControlNet-specific keys (zero convolutions, mid block output,
+            conditioning embedding) are renamed as well.
+    Returns:
+        A new dict holding the same values under the converted key names.
+    Raises:
+        KeyError: if a key that is mapped unconditionally (an HF name from the conversion map,
+            `controlnet_mid_block.weight`/`.bias` when `is_controlnet` is True, ...) is
+            missing from `unet_state_dict`.
+    """
+    # buyer beware: this is a *brittle* function,
+    # and correct output requires that all of these pieces interact in
+    # the exact order in which I have arranged them.
+    # start from the identity mapping (HF name -> HF name) and rewrite the values step by step
+    mapping = {k: k for k in unet_state_dict.keys()}
+    conversion_map = unet_conversion_map
+    if is_controlnet:
+        # remove output blocks from conversion mapping since controlnet doesn't have them
+        conversion_map = unet_conversion_map[:6]
+        for k, v in mapping.items():
+            # convert controlnet zero convolution keys
+            if "controlnet_down_blocks" in v:
+                new_key = v.replace("controlnet_down_blocks", "zero_convs")
+                # insert ".0" before the last name component, e.g. "x.3.weight" -> "x.3.0.weight"
+                new_key = ".0.".join(new_key.rsplit(".", 1))
+                mapping[k] = new_key
+        mapping["controlnet_mid_block.bias"] = "middle_block_out.0.bias"
+        mapping["controlnet_mid_block.weight"] = "middle_block_out.0.weight"
+        # the conditioning embedding is only converted when it is present in the state dict
+        if "controlnet_cond_embedding.conv_in.weight" in mapping:
+            mapping[
+                "controlnet_cond_embedding.conv_in.weight"
+            ] = "input_hint_block.0.weight"
+            mapping[
+                "controlnet_cond_embedding.conv_in.bias"
+            ] = "input_hint_block.0.bias"
+            # the six intermediate blocks land on the even hint block indices 2, 4, ..., 12
+            for i in range(6):
+                mapping[
+                    f"controlnet_cond_embedding.blocks.{i}.weight"
+                ] = f"input_hint_block.{2*(i+1)}.weight"
+                mapping[
+                    f"controlnet_cond_embedding.blocks.{i}.bias"
+                ] = f"input_hint_block.{2*(i+1)}.bias"
+            mapping[
+                "controlnet_cond_embedding.conv_out.weight"
+            ] = "input_hint_block.14.weight"
+            mapping[
+                "controlnet_cond_embedding.conv_out.bias"
+            ] = "input_hint_block.14.bias"
+    # exact renames for the layers outside the repeated blocks
+    for sd_name, hf_name in conversion_map:
+        mapping[hf_name] = sd_name
+    # rename the sub-layers of resnet blocks first ...
+    for k, v in mapping.items():
+        if "resnets" in k:
+            for sd_part, hf_part in unet_conversion_map_resnet:
+                v = v.replace(hf_part, sd_part)
+            mapping[k] = v
+    # ... then replace the HF block prefixes with the SD block prefixes
+    for k, v in mapping.items():
+        for sd_part, hf_part in unet_conversion_map_layer:
+            v = v.replace(hf_part, sd_part)
+        mapping[k] = v
+    # look up every original value under its converted name
+    new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
+    return new_state_dict
+def load_state_dict(state_dict_path):
+    file_ext = state_dict_path.rsplit(".", 1)[-1]
+    if file_ext == "safetensors":
+        state_dict = {}
+        with safe_open(state_dict_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                state_dict[key] = f.get_tensor(key)
+    else:
+        state_dict = torch.load(state_dict_path, map_location="cpu")
+    return state_dict
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the model to convert.",
+    )
+    parser.add_argument(
+        "--checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the output model.",
+    )
+    parser.add_argument(
+        "--half", action="store_true", help="Save weights in half precision."
+    )
+    parser.add_argument(
+        "--is_controlnet",
+        action="store_true",
+        help="Whether conversion is for controlnet or standard sd unet",
+    )
+    parser.add_argument(
+        "--to_safetensors",
+        action="store_true",
+        help="Whether to save state dict in safetensors format",
+    )
+    args = parser.parse_args()
+    assert args.model_path is not None, "Must provide a model path!"
+    assert args.checkpoint_path is not None, "Must provide a checkpoint path!"
+    unet_state_dict = load_state_dict(args.model_path)
+    # Convert the UNet model
+    unet_state_dict = convert_unet_state_dict(
+        unet_state_dict, is_controlnet=args.is_controlnet
+    )
+    if args.half:
+        unet_state_dict = {k: v.half() for k, v in unet_state_dict.items()}
+    Path(args.checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
+    if args.to_safetensors:
+        save_file(unet_state_dict, args.checkpoint_path)
+    else:
+        torch.save(unet_state_dict, args.checkpoint_path)
+    print(
+        f"Converted {Path(args.model_path)} to original SD format at {Path(args.checkpoint_path)}"
+    )

src/data.py ADDED Viewed

	@@ -0,0 +1,149 @@

+from pathlib import Path
+from PIL import Image
+import torchvision
+import random
+from torch.utils.data import Dataset, DataLoader
+from functools import partial
+from multiprocessing import cpu_count
+from datasets import load_dataset
+import cv2
+import numpy as np
+import torch
+class PNGDataset(Dataset):
+    def __init__(
+        self,
+        data_dir,
+        tokenizer,
+        from_hf_hub=False,
+        ucg=0.10,
+        resolution=(512, 512),
+        prompt_key="tags",
+        cond_key="cond",
+        target_key="image",
+        controlnet_hint_key=None,
+        file_extension="png",
+    ):
+        super().__init__()
+        vars(self).update(locals())
+        if from_hf_hub:
+            self.img_paths = load_dataset(data_dir)["train"]
+        else:
+            self.img_paths = list(Path(data_dir).glob(f"*.{file_extension}"))
+        self.ucg = ucg
+        self.flip_transform = torchvision.transforms.RandomHorizontalFlip(p=0.5)
+        self.transforms = torchvision.transforms.Compose(
+            [
+                torchvision.transforms.Resize(resolution),
+                torchvision.transforms.ToTensor(),
+            ]
+        )
+        self.normalize = torchvision.transforms.Normalize([0.5], [0.5])
+    def process_canny(self, image):
+        # code from https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/controlnet
+        image = np.array(image)
+        low_threshold, high_threshold = (100, 200)
+        image = cv2.Canny(image, low_threshold, high_threshold)
+        image = image[:, :, None]
+        image = np.concatenate([image, image, image], axis=2)
+        canny_image = Image.fromarray(image)
+        return canny_image
+    def __len__(self):
+        return len(self.img_paths)
+    def __getitem__(self, idx):
+        if self.from_hf_hub:
+            image = self.img_paths[idx]["image"]
+        else:
+            image = Image.open(self.img_paths[idx])
+        if self.prompt_key not in image.info:
+            print(f"Image {idx} lacks {self.prompt_key}, skipping to next image")
+            return self.__getitem__(idx + 1 % len(self))
+        if random.random() < self.ucg:
+            tags = ""
+        else:
+            tags = image.info[self.prompt_key]
+        # randomly flip image here so input image to canny has matching flip
+        image = self.flip_transform(image)
+        target = self.normalize(self.transforms(image))
+        output_dict = {self.target_key: target, self.cond_key: tags}
+        if self.controlnet_hint_key == "canny":
+            canny_image = self.transforms(self.process_canny(image))
+            output_dict[self.controlnet_hint_key] = canny_image
+        return output_dict
+    def collate_fn(self, samples):
+        prompts = torch.tensor(
+            [
+                self.tokenizer(
+                    sample[self.cond_key],
+                    padding="max_length",
+                    truncation=True,
+                ).input_ids
+                for sample in samples
+            ]
+        )
+        images = torch.stack(
+            [sample[self.target_key] for sample in samples]
+        ).contiguous()
+        batch = {
+            self.cond_key: prompts,
+            self.target_key: images,
+        }
+        if self.controlnet_hint_key is not None:
+            hint = torch.stack(
+                [sample[self.controlnet_hint_key] for sample in samples]
+            ).contiguous()
+            batch[self.controlnet_hint_key] = hint
+        return batch
+class PNGDataModule:
+    def __init__(
+        self,
+        batch_size=1,
+        num_workers=None,
+        persistent_workers=True,
+        **kwargs,  # passed to dataset class
+    ):
+        super().__init__()
+        vars(self).update(locals())
+        if num_workers is None:
+            num_workers = cpu_count() // 2
+        self.ds_wrapper = partial(PNGDataset, **kwargs)
+        self.dl_wrapper = partial(
+            DataLoader,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            persistent_workers=persistent_workers,
+        )
+    def get_dataloader(self, data_dir, shuffle=False):
+        dataset = self.ds_wrapper(data_dir=data_dir)
+        dataloader = self.dl_wrapper(
+            dataset, shuffle=shuffle, collate_fn=dataset.collate_fn
+        )
+        return dataloader

src/lab.py ADDED Viewed

	@@ -0,0 +1,474 @@

+# modified starting from HuggingFace diffusers train_dreambooth.py example
+# https://github.com/huggingface/diffusers/blob/024c4376fb19caa85275c038f071b6e1446a5cad/examples/dreambooth/train_dreambooth.py
+import os
+from pathlib import Path
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from PIL import Image
+from tqdm.auto import tqdm
+from diffusers import AutoencoderKL, StableDiffusionPipeline
+from torchvision.utils import make_grid
+import numpy as np
+from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
+    download_from_original_stable_diffusion_ckpt,
+)
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+from diffusers.schedulers import UniPCMultistepScheduler
+from .data import PNGDataModule
+logger = get_logger(__name__)
+class Lab(Accelerator):
+    def __init__(self, args, control_pipe=None):
+        """Set up the accelerator, models, optimizer and dataloaders for training.
+
+        Args:
+            args: namespace of training options; the attributes read here
+                include ``output_dir``, ``mixed_precision``, ``report_to``,
+                ``seed``, ``training_stage``, ``learning_rate``, ``batch_size``,
+                ``gradient_accumulation_steps`` and the data directories.
+            control_pipe: an already constructed pipeline to train.  When
+                ``None`` one is built with ``load_pipe`` from
+                ``args.pretrained_model_name_or_path`` and
+                ``args.controlnet_weights_path``.
+
+        NOTE: ``args.learning_rate`` is overwritten in place with the scaled
+        learning rate computed below.
+        """
+        # dictionary keys shared between the dataset batches and compute_loss
+        self.cond_key = "prompts"
+        self.target_key = "images"
+        self.args = args
+        self.output_dir = Path(args.output_dir)
+        logging_dir = str(self.output_dir / "logs")
+        accelerator_project_config = ProjectConfiguration(
+            logging_dir=logging_dir,
+        )
+        super().__init__(
+            mixed_precision=args.mixed_precision,
+            log_with=args.report_to,
+            project_config=accelerator_project_config,
+        )
+        # dtype used for the modules that are not being optimized
+        if self.mixed_precision == "fp16":
+            self.weight_dtype = torch.float16
+        elif self.mixed_precision == "bf16":
+            self.weight_dtype = torch.bfloat16
+        else:
+            self.weight_dtype = torch.float32
+        if args.seed is not None:
+            set_seed(args.seed)
+        if control_pipe is None:
+            control_pipe = self.load_pipe(
+                args.pretrained_model_name_or_path, args.controlnet_weights_path
+            )
+        self.control_pipe = control_pipe
+        vae = control_pipe.vae
+        unet = control_pipe.unet
+        text_encoder = control_pipe.text_encoder
+        tokenizer = control_pipe.tokenizer
+        # a plain pipeline has no controlnet attribute; the UNet is trained instead
+        controlnet = (
+            control_pipe.controlnet if hasattr(control_pipe, "controlnet") else None
+        )
+        self.noise_scheduler = UniPCMultistepScheduler.from_config(control_pipe.scheduler.config)
+        # the VAE and text encoder are never optimized
+        vae.requires_grad_(False)
+        text_encoder.requires_grad_(False)
+        if controlnet:
+            # with a ControlNet present the UNet stays frozen
+            unet.requires_grad_(False)
+            if args.training_stage == "zero convolutions":
+                controlnet.requires_grad_(False)
+                controlnet.controlnet_down_blocks.requires_grad_(True)
+                controlnet.controlnet_mid_block.requires_grad_(True)
+                # optimize only the zero convolution weights
+                params_to_optimize = list(
+                    controlnet.controlnet_down_blocks.parameters()
+                ) + list(controlnet.controlnet_mid_block.parameters())
+            elif args.training_stage == "input hint blocks":
+                # optimize only the conditioning-embedding block
+                controlnet.requires_grad_(False)
+                controlnet.controlnet_cond_embedding.requires_grad_(True)
+                params_to_optimize = list(
+                    controlnet.controlnet_cond_embedding.parameters()
+                )
+            else:
+                # any other stage value trains the whole ControlNet
+                controlnet.requires_grad_(True)
+                params_to_optimize = list(controlnet.parameters())
+        else:
+            unet.requires_grad_(True)
+            params_to_optimize = list(unet.parameters())
+        self.params_to_optimize = params_to_optimize
+        # scale the base learning rate by accumulation steps, batch size and
+        # process count; the result is written back into args
+        args.learning_rate = (
+            args.learning_rate
+            * args.gradient_accumulation_steps
+            * args.batch_size
+            * self.num_processes
+        )
+        if args.use_8bit_adam:
+            # imported lazily so bitsandbytes is only needed when requested
+            import bitsandbytes as bnb
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+        self.optimizer = self.prepare(
+            optimizer_class(
+                params_to_optimize,
+                lr=args.learning_rate,
+            )
+        )
+        if args.enable_xformers_memory_efficient_attention:
+            unet.enable_xformers_memory_efficient_attention()
+            if controlnet:
+                controlnet.enable_xformers_memory_efficient_attention()
+        if args.gradient_checkpointing:
+            unet.enable_gradient_checkpointing()
+            if controlnet:
+                controlnet.enable_gradient_checkpointing()
+        # NOTE(review): TF32 matmul is enabled unconditionally here; no args
+        # flag is consulted
+        torch.backends.cuda.matmul.allow_tf32 = True
+        datamodule = PNGDataModule(
+            tokenizer=tokenizer,
+            from_hf_hub=args.from_hf_hub,
+            resolution=[args.resolution, args.resolution],
+            target_key=self.target_key,
+            cond_key=self.cond_key,
+            persistent_workers=True,
+            num_workers=args.dataloader_num_workers,
+            batch_size=args.batch_size,
+            controlnet_hint_key=None if controlnet is None else args.controlnet_hint_key,
+        )
+        self.train_dataloader = self.prepare(
+            datamodule.get_dataloader(args.train_data_dir, shuffle=True)
+        )
+        # valid_dataloader only exists when a validation directory was given
+        if args.valid_data_dir:
+            self.valid_dataloader = self.prepare(
+                datamodule.get_dataloader(args.valid_data_dir)
+            )
+        # frozen modules are cast to weight_dtype; the module being optimized
+        # is prepared by the accelerator and kept in float32
+        self.vae = vae.to(self.device, dtype=self.weight_dtype)
+        self.text_encoder = text_encoder.to(self.device, dtype=self.weight_dtype)
+        if controlnet:
+            controlnet = self.prepare(controlnet)
+            self.controlnet = controlnet.to(self.device, dtype=torch.float32)
+            self.unet = unet.to(self.device, dtype=self.weight_dtype)
+        else:
+            unet = self.prepare(unet)
+            self.unet = unet.to(self.device, dtype=torch.float32)
+            self.controlnet = None
+    def load_pipe(self, sd_model_path, controlnet_path=None):
+        if self.args.vae_path:
+            vae = AutoencoderKL.from_pretrained(
+                self.args.vae_path, torch_dtype=self.weight_dtype
+            )
+        if os.path.isfile(sd_model_path):
+            file_ext = sd_model_path.rsplit(".", 1)[-1]
+            from_safetensors = file_ext == "safetensors"
+            pipe = download_from_original_stable_diffusion_ckpt(
+                sd_model_path,
+                from_safetensors=from_safetensors,
+                device="cpu",
+                load_safety_checker=False,
+            )
+            pipe.safety_checker = None
+            pipe.feature_extractor = None
+            if self.args.vae_path:
+                pipe.vae = vae
+        else:
+            if self.args.vae_path:
+                kw_args = dict(vae=vae)
+            else:
+                kw_args = dict()
+            pipe = StableDiffusionPipeline.from_pretrained(
+                sd_model_path,
+                safety_checker=None,
+                feature_extractor=None,
+                requires_safety_checker=False,
+                torch_dtype=self.weight_dtype,
+                **kw_args
+            )
+        if not controlnet_path:
+            return pipe
+        pathobj = Path(controlnet_path)
+        if pathobj.is_file():
+            controlnet = ControlNetModel.from_config(
+                ControlNetModel.load_config("configs/controlnet_config.json")
+            )
+            controlnet.load_weights_from_sd_ckpt(controlnet_path)
+        else:
+            controlnet_path = str(Path().joinpath(*pathobj.parts[:-1]))
+            subfolder = str(pathobj.parts[-1])
+            controlnet = ControlNetModel.from_pretrained(
+                controlnet_path,
+                subfolder=subfolder,
+                low_cpu_mem_usage=False,
+                device_map=None,
+            )
+        return StableDiffusionControlNetPipeline(
+            **pipe.components,
+            controlnet=controlnet,
+            requires_safety_checker=False,
+        )
+    @torch.autocast("cuda")
+    def compute_loss(self, batch):
+        images = batch[self.target_key].to(dtype=self.weight_dtype)
+        latents = self.vae.encode(images).latent_dist.sample()
+        latents = latents * self.vae.config.scaling_factor
+        # Sample noise that we'll add to the latents
+        noise = torch.randn_like(latents)
+        # Sample a random timestep for each image
+        timesteps = torch.randint(
+            0,
+            self.noise_scheduler.config.num_train_timesteps,
+            (latents.shape[0],),
+            device=latents.device,
+        )
+        timesteps = timesteps.long()
+        # Add noise to the latents according to the noise magnitude at each timestep
+        # (this is the forward diffusion process)
+        noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
+        # Get the text embedding for conditioning
+        encoder_hidden_states = self.text_encoder(batch[self.cond_key])[0]
+        if self.controlnet:
+            if self.args.controlnet_hint_key in batch:
+                controlnet_hint = batch[self.args.controlnet_hint_key].to(
+                    dtype=self.weight_dtype
+                )
+            else:
+                controlnet_hint = torch.zeros(images.shape).to(images)
+            down_block_res_samples, mid_block_res_sample = self.controlnet(
+                noisy_latents,
+                timesteps,
+                encoder_hidden_states=encoder_hidden_states,
+                controlnet_cond=controlnet_hint,
+                return_dict=False,
+            )
+        else:
+            down_block_res_samples, mid_block_res_sample = None, None
+        noise_pred = self.unet(
+            noisy_latents,
+            timesteps,
+            encoder_hidden_states=encoder_hidden_states,
+            down_block_additional_residuals=down_block_res_samples,
+            mid_block_additional_residual=mid_block_res_sample,
+        ).sample
+        # Get the target for loss depending on the prediction type
+        if self.noise_scheduler.config.prediction_type == "epsilon":
+            target = noise
+        elif self.noise_scheduler.config.prediction_type == "v_prediction":
+            target = self.noise_scheduler.get_velocity(latents, noise, timesteps)
+        else:
+            raise ValueError(
+                f"Unknown prediction type {self.noise_scheduler.config.prediction_type}"
+            )
+        loss = F.mse_loss(noise_pred, target, reduction="mean")
+        return loss, encoder_hidden_states
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        output_latents = self.vae.decode(latents).sample
+        output_latents = (output_latents / 2 + 0.5).clamp(0, 1)
+        return output_latents
+    @torch.no_grad()
+    @torch.autocast("cuda")
+    def log_images(self, batch, encoder_hidden_states, cond_scales=[0.0, 0.5, 1.0]):
+        input_tensors = batch[self.target_key].to(self.weight_dtype)
+        input_tensors = (input_tensors / 2 + 0.5).clamp(0, 1)
+        tensors_to_log = [input_tensors.cpu()]
+        [height, width] = input_tensors.shape[-2:]
+        if self.controlnet:
+            if self.args.controlnet_hint_key in batch:
+                controlnet_hint = batch[self.args.controlnet_hint_key].to(
+                    self.weight_dtype
+                )
+            else:
+                controlnet_hint = None
+            for cond_scale in cond_scales:
+                latents = self.control_pipe(
+                    image=controlnet_hint,
+                    prompt_embeds=encoder_hidden_states,
+                    controlnet_conditioning_scale=cond_scale,
+                    height=height,
+                    width=width,
+                    output_type="latent",
+                    num_inference_steps=25,
+                )[0]
+                tensors_to_log.append(self.decode_latents(latents).detach().cpu())
+            if controlnet_hint is not None:
+                tensors_to_log.append(controlnet_hint.detach().cpu())
+        else:
+            latents = self.control_pipe(
+                prompt_embeds=encoder_hidden_states,
+                height=height,
+                width=width,
+                output_type="latent",
+                num_inference_steps=25,
+            )[0]
+            tensors_to_log.append(self.decode_latents(latents).detach().cpu())
+        image_tensors = torch.cat(tensors_to_log)
+        grid = make_grid(image_tensors, normalize=False, nrow=input_tensors.shape[0])
+        grid = grid.permute(1, 2, 0).squeeze(-1) * 255
+        grid = grid.numpy().astype(np.uint8)
+        image_grid = Image.fromarray(grid)
+        image_grid.save(Path(self.trackers[0].logging_dir) / f"{self.global_step}.png")
+    def save_weights(self, to_safetensors=True):
+        save_dir = self.output_dir / f"checkpoint-{self.global_step}"
+        os.makedirs(save_dir, exist_ok=True)
+        if self.args.save_whole_pipeline:
+            self.control_pipe.save_pretrained(
+                str(save_dir), safe_serialization=to_safetensors
+            )
+        elif self.controlnet:
+            self.controlnet.save_pretrained(
+                str(save_dir / "controlnet"), safe_serialization=to_safetensors
+            )
+        else:
+            self.unet.save_pretrained(
+                str(save_dir / "unet"), safe_serialization=to_safetensors
+            )
+    def train(self, num_train_epochs=1000):
+        args = self.args
+        max_train_steps = (
+            num_train_epochs
+            * len(self.train_dataloader)
+            // args.gradient_accumulation_steps
+        )
+        if self.is_main_process:
+            self.init_trackers("tb_logs", config=vars(args))
+        self.global_step = 0
+        # Only show the progress bar once on each machine.
+        progress_bar = tqdm(
+            range(max_train_steps),
+            disable=not self.is_local_main_process,
+        )
+        progress_bar.set_description("Steps")
+        try:
+            for epoch in range(num_train_epochs):
+                # run training loop
+                if self.controlnet:
+                    self.controlnet.train()
+                else:
+                    self.unet.train()
+                for batch in self.train_dataloader:
+                    loss, encoder_hidden_states = self.compute_loss(batch)
+                    loss /= args.gradient_accumulation_steps
+                    self.backward(loss)
+                    if self.global_step % args.gradient_accumulation_steps == 0:
+                        if self.sync_gradients:
+                            self.clip_grad_norm_(
+                                self.params_to_optimize, args.max_grad_norm
+                            )
+                        self.optimizer.step()
+                        self.optimizer.zero_grad()
+                    # Checks if the accelerator has performed an optimization step behind the scenes
+                    if self.sync_gradients:
+                        progress_bar.update(1)
+                        self.global_step += 1
+                        if self.is_main_process:
+                            if self.global_step % args.checkpointing_steps == 0:
+                                self.save_weights()
+                            if args.image_logging_steps and (
+                                self.global_step % args.image_logging_steps == 0
+                                or self.global_step == 1
+                            ):
+                                self.log_images(batch, encoder_hidden_states)
+                    logs = {"training_loss": loss.detach().item()}
+                    self.log(logs, step=self.global_step)
+                    progress_bar.set_postfix(**logs)
+                    if self.global_step >= max_train_steps:
+                        break
+                self.wait_for_everyone()
+                # run validation loop
+                if args.valid_data_dir:
+                    total_valid_loss = 0
+                    if self.controlnet:
+                        self.controlnet.eval()
+                    else:
+                        self.unet.eval()
+                    for batch in self.valid_dataloader:
+                        with torch.no_grad():
+                            loss, encoder_hidden_states = self.compute_loss(batch)
+                        loss = loss.detach().item()
+                        total_valid_loss += loss
+                        logs = {"validation_loss": loss}
+                        progress_bar.set_postfix(**logs)
+                    self.log(
+                        {
+                            "validation_loss": total_valid_loss
+                            / len(self.valid_dataloader)
+                        },
+                        step=self.global_step,
+                    )
+                    self.wait_for_everyone()
+        except KeyboardInterrupt:
+            print("Keyboard interrupt detected, attempting to save trained weights")
+        # except Exception as e:
+        #    print(f"Encountered error {e}, attempting to save trained weights")
+        self.save_weights()
+        self.end_training()

src/ui_assets/controlnet_ids.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+anime_merge
+anime_dream
+anime_protogen
+anime_neverending

src/ui_assets/examples ADDED Viewed

	@@ -0,0 +1 @@


1	+ ../../examples

src/ui_assets/footer.html ADDED Viewed

	@@ -0,0 +1,9 @@

+<!-- based on https://huggingface.co/spaces/stabilityai/stable-diffusion/blob/main/app.py -->
+<div class="footer">
+    <p><h4>LICENSE</h4>
+The default model is licensed under a <a href="https://huggingface.co/stabilityai/stable-diffusion-2/blob/main/LICENSE-MODEL" style="text-decoration: underline;" target="_blank">CreativeML OpenRAIL++</a> license. The authors claim no rights on the outputs you generate; you are free to use them and are accountable for their use, which must not go against the provisions set in this license. The license forbids you from sharing any content that violates any laws, produces any harm to a person, disseminates any personal information that would be meant for harm, spreads misinformation, or targets vulnerable groups. For the full list of restrictions please <a href="https://huggingface.co/spaces/CompVis/stable-diffusion-license" target="_blank" style="text-decoration: underline;">read the license</a>.</p>
+</div>

src/ui_assets/header.html ADDED Viewed

	@@ -0,0 +1,23 @@

+<!-- based on https://huggingface.co/spaces/stabilityai/stable-diffusion/blob/main/app.py -->
+<div style="text-align: center; margin: 0 auto;">
+    <div
+      style="
+        display: inline-flex;
+        align-items: center;
+        gap: 0.8rem;
+        font-size: 1.75rem;
+      "
+    >
+    <svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 32 32" style="enable-background:new 0 0 512 512;" xml:space="preserve" width="32" height="32"><path style="fill:#FCD577;" d="M29.545 29.791V2.21c-1.22 0 -2.21 -0.99 -2.21 -2.21H4.665c0 1.22 -0.99 2.21 -2.21 2.21v27.581c1.22 0 2.21 0.99 2.21 2.21H27.335C27.335 30.779 28.325 29.791 29.545 29.791z"/><path x="98.205" y="58.928" style="fill:#99B6C6;" width="315.577" height="394.144" d="M6.138 3.683H25.861V28.317H6.138V3.683z"/><path x="98.205" y="58.928" style="fill:#7BD4EF;" width="315.577" height="131.317" d="M6.138 3.683H25.861V11.89H6.138V3.683z"/><g><path style="fill:#7190A5;" d="M14.498 10.274c0 1.446 0.983 1.155 1.953 1.502l0.504 5.317c0 0 -5.599 0.989 -6.026 2.007l0.27 -2.526c0.924 -1.462 1.286 -4.864 1.419 -6.809l0.086 0.006C12.697 9.876 14.498 10.166 14.498 10.274z"/><path style="fill:#7190A5;" d="M21.96 17.647c0 0 -0.707 1.458 -1.716 1.903c0 0 -1.502 -0.827 -1.502 -0.827c-2.276 -1.557 -2.366 -8.3 -2.366 -8.3c0 -1.718 -0.185 -1.615 -1.429 -1.615c-1.167 0 -2.127 -0.606 -2.242 0.963l-0.086 -0.006c0.059 -0.859 0.074 -1.433 0.074 -1.433c0 -1.718 1.449 -3.11 3.237 -3.11s3.237 1.392 3.237 3.11C19.168 8.332 19.334 15.617 21.96 17.647z"/></g><path style="fill:#6C8793;" d="M12.248 24.739c1.538 0.711 3.256 1.591 3.922 2.258c-1.374 0.354 -2.704 0.798 -3.513 1.32h-2.156c-1.096 -0.606 -2.011 -1.472 -2.501 -2.702c-1.953 -4.907 2.905 -8.664 2.905 -8.664c0.001 -0.001 0.002 -0.002 0.003 -0.003c0.213 -0.214 0.523 -0.301 0.811 -0.21l0.02 0.006c-0.142 0.337 -0.03 0.71 0.517 1.108c1.264 0.919 3.091 1.131 4.416 1.143c-1.755 1.338 -3.42 3.333 -4.367 5.618L12.248 24.739z"/><path style="fill:#577484;" d="M16.17 26.997c-0.666 -0.666 -2.385 -1.548 -3.922 -2.258l0.059 -0.126c0.947 -2.284 2.612 -4.28 4.367 -5.618c0.001 0 0.001 0 0.001 0c0.688 -0.525 1.391 -0.948 2.068 -1.247c0.001 0 0.001 0 0.001 0c1.009 -0.446 1.964 -0.617 2.742 -0.44c0.61 0.138 
1.109 0.492 1.439 1.095c1.752 3.205 0.601 9.913 0.601 9.913H12.657C13.466 27.796 14.796 27.352 16.17 26.997z"/><path style="fill:#F7DEB0;" d="M14.38 13.1c-0.971 -0.347 -1.687 -1.564 -1.687 -3.01c0 -0.107 0.004 -0.213 0.011 -0.318c0.116 -1.569 1.075 -2.792 2.242 -2.792c1.244 0 2.253 1.392 2.253 3.11c0 0 -0.735 6.103 1.542 7.66c-0.677 0.299 -1.38 0.722 -2.068 1.247c0 0 0 0 -0.001 0c-1.326 -0.012 -3.152 -0.223 -4.416 -1.143c-0.547 -0.398 -0.659 -0.771 -0.517 -1.108c0.426 -1.018 3.171 -1.697 3.171 -1.697L14.38 13.1z"/><path style="fill:#E5CA9E;" d="M14.38 13.1c0 0 1.019 0.216 1.544 -0.309c0 0 -0.401 1.04 -1.346 1.04"/><g><path style="fill:#EAC36E;" points="437.361,0 413.79,58.926 472.717,35.356 	" d="M27.335 0L25.862 3.683L29.545 2.21"/><path style="fill:#EAC36E;" points="437.361,512 413.79,453.074 472.717,476.644 	" d="M27.335 32L25.862 28.317L29.545 29.791"/><path style="fill:#EAC36E;" points="74.639,512 98.21,453.074 39.283,476.644 	" d="M4.665 32L6.138 28.317L2.455 29.791"/><path style="fill:#EAC36E;" points="39.283,35.356 98.21,58.926 74.639,0 	" d="M2.455 2.21L6.138 3.683L4.665 0"/><path style="fill:#EAC36E;" d="M26.425 28.881H5.574V3.119h20.851v25.761H26.425zM6.702 27.754h18.597V4.246H6.702V27.754z"/></g><g><path style="fill:#486572;" d="M12.758 21.613c-0.659 0.767 -1.245 1.613 -1.722 2.531l0.486 0.202C11.82 23.401 12.241 22.483 12.758 21.613z"/><path style="fill:#486572;" d="M21.541 25.576l-0.37 0.068c-0.553 0.101 -1.097 0.212 -1.641 0.331l-0.071 -0.201l-0.059 -0.167c-0.019 -0.056 -0.035 -0.112 -0.052 -0.169l-0.104 -0.338l-0.088 -0.342c-0.112 -0.457 -0.197 -0.922 -0.235 -1.393c-0.035 -0.47 -0.032 -0.947 0.042 -1.417c0.072 -0.47 0.205 -0.935 0.422 -1.369c-0.272 0.402 -0.469 0.856 -0.606 1.329c-0.138 0.473 -0.207 0.967 -0.234 1.462c-0.024 0.496 0.002 0.993 0.057 1.487l0.046 0.37l0.063 0.367c0.011 0.061 0.02 0.123 0.033 0.184l0.039 0.182l0.037 0.174c-0.677 0.157 -1.351 0.327 -2.019 0.514c-0.131 0.037 -0.262 0.075 -0.392 0.114l0.004 -0.004c-0.117 -0.095 -0.232 
-0.197 -0.35 -0.275c-0.059 -0.041 -0.117 -0.084 -0.177 -0.122l-0.179 -0.112c-0.239 -0.147 -0.482 -0.279 -0.727 -0.406c-0.489 -0.252 -0.985 -0.479 -1.484 -0.697c-0.998 -0.433 -2.01 -0.825 -3.026 -1.196c0.973 0.475 1.937 0.969 2.876 1.499c0.469 0.266 0.932 0.539 1.379 0.832c0.223 0.146 0.442 0.297 0.648 0.456l0.154 0.119c0.05 0.041 0.097 0.083 0.145 0.124c0.002 0.002 0.004 0.003 0.005 0.005c-0.339 0.109 -0.675 0.224 -1.009 0.349c-0.349 0.132 -0.696 0.273 -1.034 0.431c-0.338 0.159 -0.668 0.337 -0.973 0.549c0.322 -0.186 0.662 -0.334 1.01 -0.463c0.347 -0.129 0.701 -0.239 1.056 -0.34c0.394 -0.111 0.79 -0.208 1.19 -0.297c0.006 0.006 0.013 0.013 0.019 0.019l0.03 -0.03c0.306 -0.068 0.614 -0.132 0.922 -0.192c0.727 -0.14 1.457 -0.258 2.189 -0.362c0.731 -0.103 1.469 -0.195 2.197 -0.265l0.374 -0.036L21.541 25.576z"/></g></svg>
+      <h1 style="font-weight: 1000; margin-bottom: 8px;margin-top:8px">
+        <a href="https://github.com/1lint/style_controlnet">
+        Style ControlNet Web UI
+        </a>
+      </h1>
+    </div>
+    <p> Use the ControlNet architecture to control Stable Diffusion image generation style</p>
+  </div>

src/ui_assets/model_ids.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+lint/liquidfix
+prompthero/openjourney-v2
+Lykon/DreamShaper
+darkstorm2150/Protogen_x5.8_Official_Release
+runwayml/stable-diffusion-v1-5

src/ui_functions.py ADDED Viewed

	@@ -0,0 +1,285 @@

+import gradio as gr
+import torch
+import random
+from PIL import Image
+import os
+import argparse
+import shutil
+import gc
+import importlib
+import json
+from multiprocessing import cpu_count
+import cv2
+import numpy as np
+from pathlib import Path
+from diffusers import (
+    StableDiffusionControlNetPipeline,
+    StableDiffusionPipeline,
+    ControlNetModel,
+    AutoencoderKL,
+)
+from src.controlnet_pipe import ControlNetPipe as StableDiffusionControlNetPipeline
+from src.lab import Lab
+from src.ui_shared import (
+    default_scheduler,
+    scheduler_dict,
+    model_ids,
+    controlnet_ids,
+    is_hfspace,
+)
+CONTROLNET_REPO = "lint/anime_control"
+_xformers_available = importlib.util.find_spec("xformers") is not None
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# device = 'cpu'
+dtype = torch.float16 if device == "cuda" else torch.float32
+pipe = None
+loaded_model_id = ""
+loaded_controlnet_id = ""
+def load_pipe(model_id, controlnet_id, scheduler_name):
+    global pipe, loaded_model_id, loaded_controlnet_id
+    scheduler = scheduler_dict[scheduler_name]
+    reload_pipe = False
+    if pipe:
+        new_weights = pipe.components
+    else:
+        new_weights = {}
+    if model_id != loaded_model_id:
+        new_pipe = StableDiffusionPipeline.from_pretrained(
+            model_id,
+            vae=AutoencoderKL.from_pretrained("lint/anime_vae", torch_dtype=dtype),
+            safety_checker=None,
+            feature_extractor=None,
+            requires_safety_checker=False,
+            use_safetensors=False,
+            torch_dtype=dtype,
+        )
+        loaded_model_id = model_id
+        new_weights.update(new_pipe.components)
+        new_weights["scheduler"] = scheduler.from_pretrained(model_id, subfolder="scheduler")
+        reload_pipe = True
+    if controlnet_id != loaded_controlnet_id:
+        controlnet = ControlNetModel.from_pretrained(
+            CONTROLNET_REPO,
+            subfolder=controlnet_id,
+            torch_dtype=dtype,
+        )
+        loaded_controlnet_id = controlnet_id
+        new_weights["controlnet"] = controlnet
+        reload_pipe = True
+    if reload_pipe:
+        pipe = StableDiffusionControlNetPipeline(
+            **new_weights,
+            requires_safety_checker=False,
+        )
+    if device == "cuda":
+        for component in pipe.components.values():
+            if isinstance(component, torch.nn.Module):
+                component.to("cuda", torch.float16)
+        if _xformers_available:
+            pipe.enable_xformers_memory_efficient_attention()
+        pipe.enable_attention_slicing()
+        pipe.enable_vae_tiling()
+    return pipe
+# When running as a Hugging Face Space, build the pipeline at import time
+# from the first listed model and ControlNet and the default scheduler.
+if is_hfspace:
+    pipe = load_pipe(model_ids[0], controlnet_ids[0], default_scheduler)
+def extract_canny(image):
+    CANNY_THRESHOLD = (100, 200)
+    image_array = np.asarray(image)
+    canny_image = cv2.Canny(image_array, *CANNY_THRESHOLD)
+    canny_image = canny_image[:, :, None]
+    canny_image = np.concatenate([canny_image]*3, axis=2)
+    return Image.fromarray(canny_image)
+@torch.no_grad()
+def generate(
+    model_name,
+    guidance_image,
+    controlnet_name,
+    scheduler_name,
+    prompt,
+    guidance,
+    steps,
+    n_images=1,
+    width=512,
+    height=512,
+    seed=0,
+    neg_prompt="",
+    controlnet_prompt=None,
+    controlnet_negative_prompt=None,
+    controlnet_cond_scale=1.0,
+    progress=gr.Progress(track_tqdm=True),
+):
+    if seed == -1:
+        seed = random.randint(0, 2147483647)
+    if guidance_image:
+        guiadnce_image = extract_canny(guidance_image)
+    else:
+        guidance_image = torch.zeros(n_images, 3, height, width)
+    generator = torch.Generator(device).manual_seed(seed)
+    pipe = load_pipe(
+        model_id=model_name,
+        controlnet_id=controlnet_name,
+        scheduler_name=scheduler_name,
+    )
+    status_message = f"Prompt: '{prompt}' | Seed: {seed} | Guidance: {guidance} | Scheduler: {scheduler_name} | Steps: {steps}"
+    # pass None so pipeline uses base prompt as controlnet_prompt
+    if controlnet_prompt == "":
+        controlnet_prompt = None  #
+    if controlnet_negative_prompt == "":
+        controlnet_negative_prompt = None
+    if controlnet_prompt:
+        controlnet_prompt_embeds = pipe._encode_prompt(
+            controlnet_prompt,
+            device,
+            n_images,
+            do_classifier_free_guidance = guidance > 1.0,
+            negative_prompt = controlnet_negative_prompt,
+            prompt_embeds=None,
+            negative_prompt_embeds=None,
+        )
+    else:
+        controlnet_prompt_embeds = None
+    result = pipe(
+        prompt,
+        image=guidance_image,
+        height=height,
+        width=width,
+        num_inference_steps=int(steps),
+        guidance_scale=guidance,
+        negative_prompt=neg_prompt,
+        num_images_per_prompt=n_images,
+        generator=generator,
+        controlnet_conditioning_scale = float(controlnet_cond_scale),
+        controlnet_prompt_embeds = controlnet_prompt_embeds,
+    )
+    return result.images, status_message
+def run_training(
+    model_name,
+    controlnet_weights_path,
+    train_data_dir,
+    valid_data_dir,
+    train_batch_size,
+    train_whole_controlnet,
+    gradient_accumulation_steps,
+    max_train_steps,
+    train_learning_rate,
+    output_dir,
+    checkpointing_steps,
+    image_logging_steps,
+    save_whole_pipeline,
+    progress=gr.Progress(track_tqdm=True),
+):
+    global pipe
+    if device == "cpu":
+        raise gr.Error("Training not supported on CPU")
+    pathobj = Path(controlnet_weights_path)
+    controlnet_path = str(Path().joinpath(*pathobj.parts[:-1]))
+    subfolder = str(pathobj.parts[-1])
+    controlnet = ControlNetModel.from_pretrained(
+        controlnet_path,
+        subfolder=subfolder,
+        low_cpu_mem_usage=False,
+        device_map=None,
+    )
+    pipe.components["controlnet"] = controlnet
+    pipe = StableDiffusionControlNetPipeline(
+        **pipe.components,
+        requires_safety_checker=False,
+    )
+    training_args = argparse.Namespace(
+        # start training from preexisting models
+        pretrained_model_name_or_path=None,
+        controlnet_weights_path=None,
+        # dataset args
+        train_data_dir=train_data_dir,
+        valid_data_dir=valid_data_dir,
+        resolution=512,
+        from_hf_hub = train_data_dir == "lint/anybooru",
+        controlnet_hint_key=None,
+        # training args
+        # options are ["zero convolutions", "input hint blocks"], trains whole controlnet by default
+        training_stage="" if train_whole_controlnet else "zero convolutions",
+        learning_rate=float(train_learning_rate),
+        num_train_epochs=1000,
+        max_train_steps=int(max_train_steps),
+        seed=3434554,
+        max_grad_norm=1.0,
+        gradient_accumulation_steps=int(gradient_accumulation_steps),
+        # VRAM args
+        batch_size=train_batch_size,
+        mixed_precision="fp16",  # set to "fp16" for mixed-precision training.
+        gradient_checkpointing=True,  # set this to True to lower the memory usage.
+        use_8bit_adam=False,  # use 8bit optimizer from bitsandbytes
+        enable_xformers_memory_efficient_attention=True,
+        allow_tf32=True,
+        dataloader_num_workers=cpu_count(),
+        # logging args
+        output_dir=output_dir,
+        report_to="tensorboard",
+        image_logging_steps=image_logging_steps,  # disabled when 0. costs additional VRAM to log images
+        save_whole_pipeline=save_whole_pipeline,
+        checkpointing_steps=checkpointing_steps,
+    )
+    try:
+        lab = Lab(training_args, pipe)
+        lab.train(training_args.num_train_epochs)
+    except Exception as e:
+        raise gr.Error(e)
+    for component in pipe.components.values():
+        if isinstance(component, torch.nn.Module):
+            component.to(device, dtype=dtype)
+    gc.collect()
+    torch.cuda.empty_cache()
+    return f"Finished training! Check the {training_args.output_dir} directory for saved model weights"

src/ui_shared.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import diffusers.schedulers
+import os
+from pathlib import Path
+assets_directory = Path(__file__).parent / "ui_assets"
+is_hfspace = "SPACE_REPO_NAME" in os.environ
+scheduler_dict = {
+    k: v
+    for k, v in diffusers.schedulers.__dict__.items()
+    if "Scheduler" in k and "Flax" not in k
+}
+scheduler_dict.pop(
+    "VQDiffusionScheduler", None
+)  # requires unique parameter, unlike other schedulers
+scheduler_names = list(scheduler_dict.keys())
+default_scheduler = "UniPCMultistepScheduler"
+with open(assets_directory / "model_ids.txt", "r") as fp:
+    model_ids = fp.read().splitlines()
+with open(assets_directory / "controlnet_ids.txt", "r") as fp:
+    controlnet_ids = fp.read().splitlines()