Jackflack09 and lint committed
Commit 001c876 · 0 Parent(s)

Duplicate from lint/sdpipe_webui


Co-authored-by: lint <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__/
+ WIP/
+ concept_images/
+ output_model/
README.md ADDED
@@ -0,0 +1,22 @@
+ ---
+ title: Sdpipe Webui
+ emoji: 🍌
+ colorFrom: gray
+ colorTo: green
+ sdk: gradio
+ sdk_version: 3.16.2
+ app_file: app.py
+ pinned: false
+ license: openrail
+ duplicated_from: lint/sdpipe_webui
+ ---
+
+ # **Stable Diffusion Pipeline Web UI**
+
+ A Stable Diffusion web UI with first-class support for Hugging Face Diffusers pipelines and diffusion schedulers, made in the style of Automatic1111's WebUI and Evel_Space.
+
+ Supports the Hugging Face `Text-to-Image`, `Image-to-Image`, and `Inpainting` pipelines, with fast switching between pipeline modes by reusing model weights already loaded in memory.
+
+ Install requirements with `pip install -r requirements.txt`
+
+ Run with `python app.py`
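
The "fast switching" described in the README comes from rebuilding a pipeline of a different class out of components that are already in memory instead of reloading weights from disk (see `load_pipe` in `utils/functions.py` further below). A minimal sketch of that idea, using the `components` property exposed by Diffusers pipelines and one of the model IDs listed in `model_ids.txt`; this is an illustrative example, not code from the repo:

```python
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline

# Load the text-to-image weights once (model ID taken from model_ids.txt).
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# `pipe.components` holds the already-loaded UNet, VAE, text encoder,
# tokenizer, scheduler, etc. Constructing a different pipeline class from
# those same components switches modes without re-reading weights from disk.
img2img = StableDiffusionImg2ImgPipeline(**pipe.components)
```

This is the same mechanism `load_pipe` uses via `pipe_class(**pipe.components)` when the requested model ID has not changed.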
app.py ADDED
@@ -0,0 +1,225 @@
1
+ import gradio as gr
2
+ from multiprocessing import cpu_count
3
+ from utils.functions import generate, train_textual_inversion
4
+ from utils.shared import model_ids, scheduler_names, default_scheduler
5
+
6
+ default_img_size = 512
7
+
8
+ with open("html/header.html") as fp:
9
+ header = fp.read()
10
+
11
+ with open("html/footer.html") as fp:
12
+ footer = fp.read()
13
+
14
+ with gr.Blocks(css="html/style.css") as demo:
15
+
16
+ pipe_state = gr.State(lambda: 1)
17
+
18
+ gr.HTML(header)
19
+
20
+ with gr.Row():
21
+
22
+ with gr.Column(scale=70):
23
+
24
+ # with gr.Row():
25
+ prompt = gr.Textbox(
26
+ label="Prompt", placeholder="<Shift+Enter> to generate", lines=2
27
+ )
28
+ neg_prompt = gr.Textbox(label="Negative Prompt", placeholder="", lines=2)
29
+
30
+ with gr.Column(scale=30):
31
+ model_name = gr.Dropdown(
32
+ label="Model", choices=model_ids, value=model_ids[0]
33
+ )
34
+ scheduler_name = gr.Dropdown(
35
+ label="Scheduler", choices=scheduler_names, value=default_scheduler
36
+ )
37
+ generate_button = gr.Button(value="Generate", elem_id="generate-button")
38
+
39
+ with gr.Row():
40
+
41
+ with gr.Column():
42
+
43
+ with gr.Tab("Text to Image") as tab:
44
+ tab.select(lambda: 1, [], pipe_state)
45
+
46
+ with gr.Tab("Image to Image") as tab:
47
+ tab.select(lambda: 2, [], pipe_state)
48
+
49
+ image = gr.Image(
50
+ label="Image to Image",
51
+ source="upload",
52
+ tool="editor",
53
+ type="pil",
54
+ elem_id="image_upload",
55
+ ).style(height=default_img_size)
56
+ strength = gr.Slider(
57
+ label="Denoising strength",
58
+ minimum=0,
59
+ maximum=1,
60
+ step=0.02,
61
+ value=0.8,
62
+ )
63
+
64
+ with gr.Tab("Inpainting") as tab:
65
+ tab.select(lambda: 3, [], pipe_state)
66
+
67
+ inpaint_image = gr.Image(
68
+ label="Inpainting",
69
+ source="upload",
70
+ tool="sketch",
71
+ type="pil",
72
+ elem_id="image_upload",
73
+ ).style(height=default_img_size)
74
+ inpaint_strength = gr.Slider(
75
+ label="Denoising strength",
76
+ minimum=0,
77
+ maximum=1,
78
+ step=0.02,
79
+ value=0.8,
80
+ )
81
+ inpaint_options = [
82
+ "preserve non-masked portions of image",
83
+ "output entire inpainted image",
84
+ ]
85
+ inpaint_radio = gr.Radio(
86
+ inpaint_options,
87
+ value=inpaint_options[0],
88
+ show_label=False,
89
+ interactive=True,
90
+ )
91
+
92
+ with gr.Tab("Textual Inversion") as tab:
93
+ tab.select(lambda: 4, [], pipe_state)
94
+
95
+ type_of_thing = gr.Dropdown(
96
+ label="What would you like to train?",
97
+ choices=["object", "person", "style"],
98
+ value="object",
99
+ interactive=True,
100
+ )
101
+
102
+ text_train_bsz = gr.Slider(
103
+ label="Training Batch Size",
104
+ minimum=1,
105
+ maximum=8,
106
+ step=1,
107
+ value=1,
108
+ )
109
+
110
+ files = gr.File(
111
+ label=f"""Upload the images for your concept""",
112
+ file_count="multiple",
113
+ interactive=True,
114
+ visible=True,
115
+ )
116
+
117
+ text_train_steps = gr.Number(label="How many steps", value=1000)
118
+
119
+ text_learning_rate = gr.Number(label="Learning Rate", value=5.0e-4)
120
+
121
+ concept_word = gr.Textbox(
122
+ label=f"""concept word - use a unique, made up word to avoid collisions"""
123
+ )
124
+ init_word = gr.Textbox(
125
+ label=f"""initial word - to init the concept embedding"""
126
+ )
127
+
128
+ textual_inversion_button = gr.Button(value="Train Textual Inversion")
129
+
130
+ training_status = gr.Text(label="Training Status")
131
+
132
+ with gr.Row():
133
+ batch_size = gr.Slider(
134
+ label="Batch Size", value=1, minimum=1, maximum=8, step=1
135
+ )
136
+ seed = gr.Slider(-1, 2147483647, label="Seed", value=-1, step=1)
137
+
138
+ with gr.Row():
139
+ guidance = gr.Slider(
140
+ label="Guidance scale", value=7.5, minimum=0, maximum=20
141
+ )
142
+ steps = gr.Slider(
143
+ label="Steps", value=20, minimum=1, maximum=100, step=1
144
+ )
145
+
146
+ with gr.Row():
147
+ width = gr.Slider(
148
+ label="Width",
149
+ value=default_img_size,
150
+ minimum=64,
151
+ maximum=1024,
152
+ step=32,
153
+ )
154
+ height = gr.Slider(
155
+ label="Height",
156
+ value=default_img_size,
157
+ minimum=64,
158
+ maximum=1024,
159
+ step=32,
160
+ )
161
+
162
+ with gr.Column():
163
+ gallery = gr.Gallery(
164
+ label="Generated images", show_label=False, elem_id="gallery"
165
+ ).style(height=default_img_size, grid=2)
166
+
167
+ generation_details = gr.Markdown()
168
+
169
+ pipe_kwargs = gr.Textbox(label="Pipe kwargs", value="{\n\t\n}")
170
+
171
+ # if torch.cuda.is_available():
172
+ # giga = 2**30
173
+ # vram_guage = gr.Slider(0, torch.cuda.memory_reserved(0)/giga, label='VRAM Allocated to Reserved (GB)', value=0, step=1)
174
+ # demo.load(lambda : torch.cuda.memory_allocated(0)/giga, inputs=[], outputs=vram_guage, every=0.5, show_progress=False)
175
+
176
+ gr.HTML(footer)
177
+
178
+ inputs = [
179
+ model_name,
180
+ scheduler_name,
181
+ prompt,
182
+ guidance,
183
+ steps,
184
+ batch_size,
185
+ width,
186
+ height,
187
+ seed,
188
+ image,
189
+ strength,
190
+ inpaint_image,
191
+ inpaint_strength,
192
+ inpaint_radio,
193
+ neg_prompt,
194
+ pipe_state,
195
+ pipe_kwargs,
196
+ ]
197
+ outputs = [gallery, generation_details]
198
+
199
+ prompt.submit(generate, inputs=inputs, outputs=outputs)
200
+ generate_button.click(generate, inputs=inputs, outputs=outputs)
201
+
202
+ textual_inversion_inputs = [
203
+ model_name,
204
+ scheduler_name,
205
+ type_of_thing,
206
+ files,
207
+ concept_word,
208
+ init_word,
209
+ text_train_steps,
210
+ text_train_bsz,
211
+ text_learning_rate,
212
+ ]
213
+
214
+ textual_inversion_button.click(
215
+ train_textual_inversion,
216
+ inputs=textual_inversion_inputs,
217
+ outputs=[training_status],
218
+ )
219
+
220
+
221
+ # demo = gr.TabbedInterface([demo, dreambooth_tab], ["Main", "Dreambooth"])
222
+
223
+ demo.queue(concurrency_count=cpu_count())
224
+
225
+ demo.launch()
html/footer.html ADDED
@@ -0,0 +1,15 @@
+
+ <!-- based on https://huggingface.co/spaces/stabilityai/stable-diffusion/blob/main/app.py -->
+
+
+ <div class="footer">
+ <p>Model Architecture by <a href="https://huggingface.co/stabilityai" style="text-decoration: underline;" target="_blank">StabilityAI</a> - Pipelines by 🤗 Hugging Face
+ </p>
+ </div>
+ <div class="acknowledgments">
+ <p><h4>LICENSE</h4>
+ The model is licensed with a <a href="https://huggingface.co/stabilityai/stable-diffusion-2/blob/main/LICENSE-MODEL" style="text-decoration: underline;" target="_blank">CreativeML OpenRAIL++</a> license. The authors claim no rights on the outputs you generate; you are free to use them, but you are accountable for their use, which must not go against the provisions set in this license. The license forbids you from sharing any content that violates any laws, produces harm to a person, disseminates personal information intended to cause harm, spreads misinformation, or targets vulnerable groups. For the full list of restrictions please <a href="https://huggingface.co/spaces/CompVis/stable-diffusion-license" target="_blank" style="text-decoration: underline;">read the license</a></p>
+ <p><h4>Biases and content acknowledgment</h4>
+ Despite how impressive it is to turn text into images, be aware that this model may output content that reinforces or exacerbates societal biases, as well as realistic faces, pornography and violence. The model was trained on the <a href="https://laion.ai/blog/laion-5b/" style="text-decoration: underline;" target="_blank">LAION-5B dataset</a>, which scraped non-curated image-text pairs from the internet (with the exception of illegal content, which was removed) and is meant for research purposes. You can read more in the <a href="https://huggingface.co/CompVis/stable-diffusion-v1-4" style="text-decoration: underline;" target="_blank">model card</a>.</p>
+ </div>
+
html/header.html ADDED
@@ -0,0 +1,25 @@
+
+ <!-- based on https://huggingface.co/spaces/stabilityai/stable-diffusion/blob/main/app.py -->
+
+ <div style="text-align: center; margin: 0 auto;">
+ <div
+ style="
+ display: inline-flex;
+ align-items: center;
+ gap: 0.8rem;
+ font-size: 1.75rem;
+ "
+ >
+ <svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 32 32" style="enable-background:new 0 0 512 512;" xml:space="preserve" width="32" height="32"><path style="fill:#FCD577;" d="M29.545 29.791V2.21c-1.22 0 -2.21 -0.99 -2.21 -2.21H4.665c0 1.22 -0.99 2.21 -2.21 2.21v27.581c1.22 0 2.21 0.99 2.21 2.21H27.335C27.335 30.779 28.325 29.791 29.545 29.791z"/><path x="98.205" y="58.928" style="fill:#99B6C6;" width="315.577" height="394.144" d="M6.138 3.683H25.861V28.317H6.138V3.683z"/><path x="98.205" y="58.928" style="fill:#7BD4EF;" width="315.577" height="131.317" d="M6.138 3.683H25.861V11.89H6.138V3.683z"/><g><path style="fill:#7190A5;" d="M14.498 10.274c0 1.446 0.983 1.155 1.953 1.502l0.504 5.317c0 0 -5.599 0.989 -6.026 2.007l0.27 -2.526c0.924 -1.462 1.286 -4.864 1.419 -6.809l0.086 0.006C12.697 9.876 14.498 10.166 14.498 10.274z"/><path style="fill:#7190A5;" d="M21.96 17.647c0 0 -0.707 1.458 -1.716 1.903c0 0 -1.502 -0.827 -1.502 -0.827c-2.276 -1.557 -2.366 -8.3 -2.366 -8.3c0 -1.718 -0.185 -1.615 -1.429 -1.615c-1.167 0 -2.127 -0.606 -2.242 0.963l-0.086 -0.006c0.059 -0.859 0.074 -1.433 0.074 -1.433c0 -1.718 1.449 -3.11 3.237 -3.11s3.237 1.392 3.237 3.11C19.168 8.332 19.334 15.617 21.96 17.647z"/></g><path style="fill:#6C8793;" d="M12.248 24.739c1.538 0.711 3.256 1.591 3.922 2.258c-1.374 0.354 -2.704 0.798 -3.513 1.32h-2.156c-1.096 -0.606 -2.011 -1.472 -2.501 -2.702c-1.953 -4.907 2.905 -8.664 2.905 -8.664c0.001 -0.001 0.002 -0.002 0.003 -0.003c0.213 -0.214 0.523 -0.301 0.811 -0.21l0.02 0.006c-0.142 0.337 -0.03 0.71 0.517 1.108c1.264 0.919 3.091 1.131 4.416 1.143c-1.755 1.338 -3.42 3.333 -4.367 5.618L12.248 24.739z"/><path style="fill:#577484;" d="M16.17 26.997c-0.666 -0.666 -2.385 -1.548 -3.922 -2.258l0.059 -0.126c0.947 -2.284 2.612 -4.28 4.367 -5.618c0.001 0 0.001 0 0.001 0c0.688 -0.525 1.391 -0.948 2.068 -1.247c0.001 0 0.001 0 0.001 0c1.009 -0.446 1.964 -0.617 2.742 -0.44c0.61 0.138 1.109 0.492 1.439 1.095c1.752 3.205 0.601 9.913 0.601 9.913H12.657C13.466 27.796 14.796 27.352 16.17 26.997z"/><path style="fill:#F7DEB0;" d="M14.38 13.1c-0.971 -0.347 -1.687 -1.564 -1.687 -3.01c0 -0.107 0.004 -0.213 0.011 -0.318c0.116 -1.569 1.075 -2.792 2.242 -2.792c1.244 0 2.253 1.392 2.253 3.11c0 0 -0.735 6.103 1.542 7.66c-0.677 0.299 -1.38 0.722 -2.068 1.247c0 0 0 0 -0.001 0c-1.326 -0.012 -3.152 -0.223 -4.416 -1.143c-0.547 -0.398 -0.659 -0.771 -0.517 -1.108c0.426 -1.018 3.171 -1.697 3.171 -1.697L14.38 13.1z"/><path style="fill:#E5CA9E;" d="M14.38 13.1c0 0 1.019 0.216 1.544 -0.309c0 0 -0.401 1.04 -1.346 1.04"/><g><path style="fill:#EAC36E;" points="437.361,0 413.79,58.926 472.717,35.356 " d="M27.335 0L25.862 3.683L29.545 2.21"/><path style="fill:#EAC36E;" points="437.361,512 413.79,453.074 472.717,476.644 " d="M27.335 32L25.862 28.317L29.545 29.791"/><path style="fill:#EAC36E;" points="74.639,512 98.21,453.074 39.283,476.644 " d="M4.665 32L6.138 28.317L2.455 29.791"/><path style="fill:#EAC36E;" points="39.283,35.356 98.21,58.926 74.639,0 " d="M2.455 2.21L6.138 3.683L4.665 0"/><path style="fill:#EAC36E;" d="M26.425 28.881H5.574V3.119h20.851v25.761H26.425zM6.702 27.754h18.597V4.246H6.702V27.754z"/></g><g><path style="fill:#486572;" d="M12.758 21.613c-0.659 0.767 -1.245 1.613 -1.722 2.531l0.486 0.202C11.82 23.401 12.241 22.483 12.758 21.613z"/><path style="fill:#486572;" d="M21.541 25.576l-0.37 0.068c-0.553 0.101 -1.097 0.212 -1.641 0.331l-0.071 -0.201l-0.059 -0.167c-0.019 -0.056 -0.035 -0.112 -0.052 -0.169l-0.104 -0.338l-0.088 
-0.342c-0.112 -0.457 -0.197 -0.922 -0.235 -1.393c-0.035 -0.47 -0.032 -0.947 0.042 -1.417c0.072 -0.47 0.205 -0.935 0.422 -1.369c-0.272 0.402 -0.469 0.856 -0.606 1.329c-0.138 0.473 -0.207 0.967 -0.234 1.462c-0.024 0.496 0.002 0.993 0.057 1.487l0.046 0.37l0.063 0.367c0.011 0.061 0.02 0.123 0.033 0.184l0.039 0.182l0.037 0.174c-0.677 0.157 -1.351 0.327 -2.019 0.514c-0.131 0.037 -0.262 0.075 -0.392 0.114l0.004 -0.004c-0.117 -0.095 -0.232 -0.197 -0.35 -0.275c-0.059 -0.041 -0.117 -0.084 -0.177 -0.122l-0.179 -0.112c-0.239 -0.147 -0.482 -0.279 -0.727 -0.406c-0.489 -0.252 -0.985 -0.479 -1.484 -0.697c-0.998 -0.433 -2.01 -0.825 -3.026 -1.196c0.973 0.475 1.937 0.969 2.876 1.499c0.469 0.266 0.932 0.539 1.379 0.832c0.223 0.146 0.442 0.297 0.648 0.456l0.154 0.119c0.05 0.041 0.097 0.083 0.145 0.124c0.002 0.002 0.004 0.003 0.005 0.005c-0.339 0.109 -0.675 0.224 -1.009 0.349c-0.349 0.132 -0.696 0.273 -1.034 0.431c-0.338 0.159 -0.668 0.337 -0.973 0.549c0.322 -0.186 0.662 -0.334 1.01 -0.463c0.347 -0.129 0.701 -0.239 1.056 -0.34c0.394 -0.111 0.79 -0.208 1.19 -0.297c0.006 0.006 0.013 0.013 0.019 0.019l0.03 -0.03c0.306 -0.068 0.614 -0.132 0.922 -0.192c0.727 -0.14 1.457 -0.258 2.189 -0.362c0.731 -0.103 1.469 -0.195 2.197 -0.265l0.374 -0.036L21.541 25.576z"/></g></svg>
+
+ <h1 style="font-weight: 1000; margin-bottom: 8px;margin-top:8px">
+ Stable Diffusion Pipeline UI
+ </h1>
+ </div>
+ <p style="margin-bottom: 4px; font-size: 100%; line-height: 24px;">
+ A Stable Diffusion web UI with first-class support for Hugging Face Diffusers pipelines and diffusion schedulers, made in the style of <a style="text-decoration: underline;" href="https://github.com/AUTOMATIC1111/stable-diffusion-webui">Automatic1111's WebUI</a> and <a style="text-decoration: underline;" href="https://huggingface.co/spaces/Evel/Evel_Space">Evel_Space</a>.
+ </p>
+ <p>Supports Text-to-Image, Image-to-Image, and Inpainting modes, with fast switching between pipeline modes by reusing model weights already loaded in memory.
+ </p>
+ </div>
+
html/style.css ADDED
@@ -0,0 +1,38 @@
+
+ #image_upload{min-height: 512px}
+ #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 512px}
+ #image_upload .touch-none{display: flex}
+
+
+ #generate-button {
+ color:white;
+ border-color: orangered;
+ background: orange;
+ height: 45px;
+ }
+
+ .footer {
+ margin-bottom: 45px;
+ margin-top: 35px;
+ text-align: center;
+ border-bottom: 1px solid #e5e5e5;
+ }
+ .footer>p {
+ font-size: .8rem;
+ display: inline-block;
+ padding: 0 10px;
+ transform: translateY(10px);
+ background: white;
+ }
+ .dark .footer {
+ border-color: #303030;
+ }
+ .dark .footer>p {
+ background: #0b0f19;
+ }
+ .acknowledgments h4{
+ margin: 1.25em 0 .25em 0;
+ font-weight: bold;
+ font-size: 115%;
+ }
+
model_ids.txt ADDED
@@ -0,0 +1,6 @@
+ andite/anything-v4.0
+ hakurei/waifu-diffusion
+ prompthero/openjourney-v2
+ runwayml/stable-diffusion-v1-5
+ johnslegers/epic-diffusion
+ stabilityai/stable-diffusion-2-1
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ accelerate==0.15.0
+ datasets==2.3.2
+ diffusers==0.11.1
+ gradio==3.16.2
+ huggingface_hub==0.11.1
+ numpy==1.23.3
+ packaging==23.0
+ Pillow==9.4.0
+ torch
+ torchvision
+ tqdm==4.64.0
+ transformers==4.25.1
test.ipynb ADDED
@@ -0,0 +1,73 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "with open('model_ids.txt', 'r') as fp:\n",
10
+ " model_ids = fp.read().splitlines() "
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 4,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "data": {
20
+ "text/plain": [
21
+ "['andite/anything-v4.0',\n",
22
+ " 'hakurei/waifu-diffusion',\n",
23
+ " 'prompthero/openjourney-v2',\n",
24
+ " 'runwayml/stable-diffusion-v1-5',\n",
25
+ " 'johnslegers/epic-diffusion',\n",
26
+ " 'stabilityai/stable-diffusion-2-1']"
27
+ ]
28
+ },
29
+ "execution_count": 4,
30
+ "metadata": {},
31
+ "output_type": "execute_result"
32
+ }
33
+ ],
34
+ "source": [
35
+ "model_ids"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": []
44
+ }
45
+ ],
46
+ "metadata": {
47
+ "kernelspec": {
48
+ "display_name": "ml",
49
+ "language": "python",
50
+ "name": "python3"
51
+ },
52
+ "language_info": {
53
+ "codemirror_mode": {
54
+ "name": "ipython",
55
+ "version": 3
56
+ },
57
+ "file_extension": ".py",
58
+ "mimetype": "text/x-python",
59
+ "name": "python",
60
+ "nbconvert_exporter": "python",
61
+ "pygments_lexer": "ipython3",
62
+ "version": "3.10.8"
63
+ },
64
+ "orig_nbformat": 4,
65
+ "vscode": {
66
+ "interpreter": {
67
+ "hash": "cbbcdde725e9a65f1cb734ac4223fed46e03daf1eb62d8ccb3c48face3871521"
68
+ }
69
+ }
70
+ },
71
+ "nbformat": 4,
72
+ "nbformat_minor": 2
73
+ }
utils/__init__.py ADDED
File without changes
utils/functions.py ADDED
@@ -0,0 +1,273 @@
1
+ import gradio as gr
2
+ import torch
3
+ import random
4
+ from PIL import Image
5
+ import os
6
+ import argparse
7
+ import shutil
8
+ import gc
9
+ import importlib
10
+ import json
11
+
12
+ from diffusers import (
13
+ StableDiffusionPipeline,
14
+ StableDiffusionImg2ImgPipeline,
15
+ )
16
+
17
+
18
+ from .inpaint_pipeline import SDInpaintPipeline as StableDiffusionInpaintPipelineLegacy
19
+
20
+ from .textual_inversion import main as run_textual_inversion
21
+ from .shared import default_scheduler, scheduler_dict, model_ids
22
+
23
+
24
+ _xformers_available = importlib.util.find_spec("xformers") is not None
25
+ device = "cuda" if torch.cuda.is_available() else "cpu"
26
+ # device = 'cpu'
27
+ dtype = torch.float16 if device == "cuda" else torch.float32
28
+ low_vram_mode = False
29
+
30
+
31
+ tab_to_pipeline = {
32
+ 1: StableDiffusionPipeline,
33
+ 2: StableDiffusionImg2ImgPipeline,
34
+ 3: StableDiffusionInpaintPipelineLegacy,
35
+ }
36
+
37
+
38
+ def load_pipe(model_id, scheduler_name, tab_index=1, pipe_kwargs="{}"):
39
+ global pipe, loaded_model_id
40
+
41
+ scheduler = scheduler_dict[scheduler_name]
42
+
43
+ pipe_class = tab_to_pipeline[tab_index]
44
+
45
+ # load new weights from disk only when changing model_id
46
+ if model_id != loaded_model_id:
47
+ pipe = pipe_class.from_pretrained(
48
+ model_id,
49
+ torch_dtype=dtype,
50
+ safety_checker=None,
51
+ requires_safety_checker=False,
52
+ scheduler=scheduler.from_pretrained(model_id, subfolder="scheduler"),
53
+ **json.loads(pipe_kwargs),
54
+ )
55
+ loaded_model_id = model_id
56
+
57
+ # if same model_id, instantiate new pipeline with same underlying pytorch objects to avoid reloading weights from disk
58
+ elif pipe_class != pipe.__class__ or not isinstance(pipe.scheduler, scheduler):
59
+ pipe.components["scheduler"] = scheduler.from_pretrained(
60
+ model_id, subfolder="scheduler"
61
+ )
62
+ pipe = pipe_class(**pipe.components)
63
+
64
+ if device == "cuda":
65
+ pipe = pipe.to(device)
66
+ if _xformers_available:
67
+ pipe.enable_xformers_memory_efficient_attention()
68
+ print("using xformers")
69
+ if low_vram_mode:
70
+ pipe.enable_attention_slicing()
71
+ print("using attention slicing to lower VRAM")
72
+
73
+ return pipe
74
+
75
+
76
+ pipe = None
77
+ loaded_model_id = ""
78
+ pipe = load_pipe(model_ids[0], default_scheduler)
79
+
80
+
81
+ def pad_image(image):
82
+ w, h = image.size
83
+ if w == h:
84
+ return image
85
+ elif w > h:
86
+ new_image = Image.new(image.mode, (w, w), (0, 0, 0))
87
+ new_image.paste(image, (0, (w - h) // 2))
88
+ return new_image
89
+ else:
90
+ new_image = Image.new(image.mode, (h, h), (0, 0, 0))
91
+ new_image.paste(image, ((h - w) // 2, 0))
92
+ return new_image
93
+
94
+
95
+ @torch.no_grad()
96
+ def generate(
97
+ model_name,
98
+ scheduler_name,
99
+ prompt,
100
+ guidance,
101
+ steps,
102
+ n_images=1,
103
+ width=512,
104
+ height=512,
105
+ seed=0,
106
+ image=None,
107
+ strength=0.5,
108
+ inpaint_image=None,
109
+ inpaint_strength=0.5,
110
+ inpaint_radio="",
111
+ neg_prompt="",
112
+ tab_index=1,
113
+ pipe_kwargs="{}",
114
+ progress=gr.Progress(track_tqdm=True),
115
+ ):
116
+
117
+ if seed == -1:
118
+ seed = random.randint(0, 2147483647)
119
+
120
+ generator = torch.Generator(device).manual_seed(seed)
121
+
122
+ pipe = load_pipe(
123
+ model_id=model_name,
124
+ scheduler_name=scheduler_name,
125
+ tab_index=tab_index,
126
+ pipe_kwargs=pipe_kwargs,
127
+ )
128
+
129
+ status_message = f"Prompt: '{prompt}' | Seed: {seed} | Guidance: {guidance} | Scheduler: {scheduler_name} | Steps: {steps}"
130
+
131
+ if tab_index == 1:
132
+ status_message = "Text to Image " + status_message
133
+
134
+ result = pipe(
135
+ prompt,
136
+ negative_prompt=neg_prompt,
137
+ num_images_per_prompt=n_images,
138
+ num_inference_steps=int(steps),
139
+ guidance_scale=guidance,
140
+ width=width,
141
+ height=height,
142
+ generator=generator,
143
+ )
144
+
145
+ elif tab_index == 2:
146
+
147
+ status_message = "Image to Image " + status_message
148
+ print(image.size)
149
+ image = image.resize((width, height))
150
+ print(image.size)
151
+
152
+ result = pipe(
153
+ prompt,
154
+ negative_prompt=neg_prompt,
155
+ num_images_per_prompt=n_images,
156
+ image=image,
157
+ num_inference_steps=int(steps),
158
+ strength=strength,
159
+ guidance_scale=guidance,
160
+ generator=generator,
161
+ )
162
+
163
+ elif tab_index == 3:
164
+ status_message = "Inpainting " + status_message
165
+
166
+ init_image = inpaint_image["image"].resize((width, height))
167
+ mask = inpaint_image["mask"].resize((width, height))
168
+
169
+ result = pipe(
170
+ prompt,
171
+ negative_prompt=neg_prompt,
172
+ num_images_per_prompt=n_images,
173
+ image=init_image,
174
+ mask_image=mask,
175
+ num_inference_steps=int(steps),
176
+ strength=inpaint_strength,
177
+ preserve_unmasked_image=(
178
+ inpaint_radio == "preserve non-masked portions of image"
179
+ ),
180
+ guidance_scale=guidance,
181
+ generator=generator,
182
+ )
183
+
184
+ else:
185
+ return None, f"Unhandled tab index: {tab_index}"
186
+
187
+ return result.images, status_message
188
+
189
+
190
+ # based on lvkaokao/textual-inversion-training
191
+ def train_textual_inversion(
192
+ model_name,
193
+ scheduler_name,
194
+ type_of_thing,
195
+ files,
196
+ concept_word,
197
+ init_word,
198
+ text_train_steps,
199
+ text_train_bsz,
200
+ text_learning_rate,
201
+ progress=gr.Progress(track_tqdm=True),
202
+ ):
203
+
204
+ if device == "cpu":
205
+ raise gr.Error("Textual inversion training not supported on CPU")
206
+
207
+ pipe = load_pipe(
208
+ model_id=model_name,
209
+ scheduler_name=scheduler_name,
210
+ tab_index=1,
211
+ )
212
+
213
+ pipe.disable_xformers_memory_efficient_attention() # xformers handled by textual inversion script
214
+
215
+ concept_dir = "concept_images"
216
+ output_dir = "output_model"
217
+ training_resolution = 512
218
+
219
+ if os.path.exists(output_dir):
220
+ shutil.rmtree("output_model")
221
+ if os.path.exists(concept_dir):
222
+ shutil.rmtree("concept_images")
223
+
224
+ os.makedirs(concept_dir, exist_ok=True)
225
+ os.makedirs(output_dir, exist_ok=True)
226
+
227
+ gc.collect()
228
+ torch.cuda.empty_cache()
229
+
230
+ if concept_word == "" or concept_word == None:
231
+ raise gr.Error("You forgot to define your concept prompt")
232
+
233
+ for j, file_temp in enumerate(files):
234
+ file = Image.open(file_temp.name)
235
+ image = pad_image(file)
236
+ image = image.resize((training_resolution, training_resolution))
237
+ extension = file_temp.name.split(".")[-1]  # use the last segment so names containing extra dots keep the correct extension
238
+ image = image.convert("RGB")
239
+ image.save(f"{concept_dir}/{j+1}.{extension}", quality=100)
240
+
241
+ args_general = argparse.Namespace(
242
+ train_data_dir=concept_dir,
243
+ learnable_property=type_of_thing,
244
+ placeholder_token=concept_word,
245
+ initializer_token=init_word,
246
+ resolution=training_resolution,
247
+ train_batch_size=text_train_bsz,
248
+ gradient_accumulation_steps=1,
249
+ gradient_checkpointing=True,
250
+ mixed_precision="fp16",
251
+ use_bf16=False,
252
+ max_train_steps=int(text_train_steps),
253
+ learning_rate=text_learning_rate,
254
+ scale_lr=True,
255
+ lr_scheduler="constant",
256
+ lr_warmup_steps=0,
257
+ output_dir=output_dir,
258
+ )
259
+
260
+ try:
261
+ final_result = run_textual_inversion(pipe, args_general)
262
+ except Exception as e:
263
+ raise gr.Error(e)
264
+
265
+ pipe.text_encoder = pipe.text_encoder.eval().to(device, dtype=dtype)
266
+ pipe.unet = pipe.unet.eval().to(device, dtype=dtype)
267
+
268
+ gc.collect()
269
+ torch.cuda.empty_cache()
270
+
271
+ return (
272
+ f"Finished training! Check the {output_dir} directory for saved model weights"
273
+ )
utils/inpaint_pipeline.py ADDED
@@ -0,0 +1,288 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ import torch
14
+ from typing import Optional, Union, List, Callable
15
+ import PIL
16
+ import numpy as np
17
+
18
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint_legacy import (
19
+ preprocess_image,
20
+ deprecate,
21
+ StableDiffusionInpaintPipelineLegacy,
22
+ StableDiffusionPipelineOutput,
23
+ PIL_INTERPOLATION,
24
+ )
25
+
26
+
27
+ def preprocess_mask(mask, scale_factor=8):
28
+ mask = mask.convert("L")
29
+ w, h = mask.size
30
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
31
+
32
+ # input_mask = mask.resize((w, h), resample=PIL_INTERPOLATION["nearest"])
33
+ input_mask = np.array(mask).astype(np.float32) / 255.0
34
+ input_mask = np.tile(input_mask, (3, 1, 1))
35
+ input_mask = input_mask[None].transpose(0, 1, 2, 3) # add batch dimension
36
+ input_mask = 1 - input_mask # repaint white, keep black
37
+ input_mask = torch.round(torch.from_numpy(input_mask))
38
+
39
+ mask = mask.resize(
40
+ (w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]
41
+ )
42
+ mask = np.array(mask).astype(np.float32) / 255.0
43
+ mask = np.tile(mask, (4, 1, 1))
44
+ mask = mask[None].transpose(0, 1, 2, 3) # add batch dimension
45
+ mask = 1 - mask # repaint white, keep black
46
+ mask = torch.round(torch.from_numpy(mask))
47
+
48
+ return mask, input_mask
49
+
50
+
51
+ class SDInpaintPipeline(StableDiffusionInpaintPipelineLegacy):
52
+
53
+ # forward call is same as StableDiffusionInpaintPipelineLegacy, but with line added to avoid noise added to final latents right before decoding step
54
+ @torch.no_grad()
55
+ def __call__(
56
+ self,
57
+ prompt: Union[str, List[str]],
58
+ image: Union[torch.FloatTensor, PIL.Image.Image] = None,
59
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
60
+ strength: float = 0.8,
61
+ num_inference_steps: Optional[int] = 50,
62
+ guidance_scale: Optional[float] = 7.5,
63
+ negative_prompt: Optional[Union[str, List[str]]] = None,
64
+ num_images_per_prompt: Optional[int] = 1,
65
+ add_predicted_noise: Optional[bool] = False,
66
+ eta: Optional[float] = 0.0,
67
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
68
+ output_type: Optional[str] = "pil",
69
+ return_dict: bool = True,
70
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
71
+ callback_steps: Optional[int] = 1,
72
+ preserve_unmasked_image: bool = True,
73
+ **kwargs,
74
+ ):
75
+ r"""
76
+ Function invoked when calling the pipeline for generation.
77
+
78
+ Args:
79
+ prompt (`str` or `List[str]`):
80
+ The prompt or prompts to guide the image generation.
81
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
82
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
83
+ process. This is the image whose masked region will be inpainted.
84
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
85
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
86
+ replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
87
+ PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
88
+ contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
89
+ strength (`float`, *optional*, defaults to 0.8):
90
+ Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
91
+ is 1, the denoising process will be run on the masked area for the full number of iterations specified
92
+ in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to
93
+ that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
94
+ num_inference_steps (`int`, *optional*, defaults to 50):
95
+ The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
96
+ the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
97
+ guidance_scale (`float`, *optional*, defaults to 7.5):
98
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
99
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
100
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
101
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
102
+ usually at the expense of lower image quality.
103
+ negative_prompt (`str` or `List[str]`, *optional*):
104
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
105
+ if `guidance_scale` is less than `1`).
106
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
107
+ The number of images to generate per prompt.
108
+ add_predicted_noise (`bool`, *optional*, defaults to False):
109
+ Use predicted noise instead of random noise when constructing noisy versions of the original image in
110
+ the reverse diffusion process
111
+ eta (`float`, *optional*, defaults to 0.0):
112
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
113
+ [`schedulers.DDIMScheduler`], will be ignored for others.
114
+ generator (`torch.Generator`, *optional*):
115
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
116
+ to make generation deterministic.
117
+ output_type (`str`, *optional*, defaults to `"pil"`):
118
+ The output format of the generate image. Choose between
119
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
120
+ return_dict (`bool`, *optional*, defaults to `True`):
121
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
122
+ plain tuple.
123
+ callback (`Callable`, *optional*):
124
+ A function that will be called every `callback_steps` steps during inference. The function will be
125
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
126
+ callback_steps (`int`, *optional*, defaults to 1):
127
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
128
+ called at every step.
129
+ preserve_unmasked_image (`bool`, *optional*, defaults to `True`):
130
+ Whether or not to preserve the unmasked portions of the original image in the inpainted output. If False,
131
+ inpainting of the masked latents may produce noticeable distortion of unmasked portions of the decoded
132
+ image.
133
+
134
+ Returns:
135
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
136
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
137
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
138
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
139
+ (nsfw) content, according to the `safety_checker`.
140
+ """
141
+ message = "Please use `image` instead of `init_image`."
142
+ init_image = deprecate("init_image", "0.13.0", message, take_from=kwargs)
143
+ image = init_image or image
144
+
145
+ # 1. Check inputs
146
+ self.check_inputs(prompt, strength, callback_steps)
147
+
148
+ # 2. Define call parameters
149
+ batch_size = 1 if isinstance(prompt, str) else len(prompt)
150
+ device = self._execution_device
151
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
152
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
153
+ # corresponds to doing no classifier free guidance.
154
+ do_classifier_free_guidance = guidance_scale > 1.0
155
+
156
+ # 3. Encode input prompt
157
+ text_embeddings = self._encode_prompt(
158
+ prompt,
159
+ device,
160
+ num_images_per_prompt,
161
+ do_classifier_free_guidance,
162
+ negative_prompt,
163
+ )
164
+
165
+ # 4. Preprocess image and mask
166
+ if not isinstance(image, torch.FloatTensor):
167
+ image = preprocess_image(image)
168
+
169
+ # get mask corresponding to input latents as well as image
170
+ if not isinstance(mask_image, torch.FloatTensor):
171
+ mask_image, input_mask_image = preprocess_mask(
172
+ mask_image, self.vae_scale_factor
173
+ )
174
+
175
+ # 5. set timesteps
176
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
177
+ timesteps, num_inference_steps = self.get_timesteps(
178
+ num_inference_steps, strength, device
179
+ )
180
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
181
+
182
+ # 6. Prepare latent variables
183
+ # encode the init image into latents and scale the latents
184
+ latents, init_latents_orig, noise = self.prepare_latents(
185
+ image,
186
+ latent_timestep,
187
+ batch_size,
188
+ num_images_per_prompt,
189
+ text_embeddings.dtype,
190
+ device,
191
+ generator,
192
+ )
193
+
194
+ # 7. Prepare mask latent
195
+ mask = mask_image.to(device=self.device, dtype=latents.dtype)
196
+ mask = torch.cat([mask] * batch_size * num_images_per_prompt)
197
+
198
+ # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
199
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
200
+
201
+ # 9. Denoising loop
202
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
203
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
204
+ for i, t in enumerate(timesteps):
205
+
206
+ # expand the latents if we are doing classifier free guidance
207
+ latent_model_input = (
208
+ torch.cat([latents] * 2) if do_classifier_free_guidance else latents
209
+ )
210
+ latent_model_input = self.scheduler.scale_model_input(
211
+ latent_model_input, t
212
+ )
213
+
214
+ # predict the noise residual
215
+ noise_pred = self.unet(
216
+ latent_model_input, t, encoder_hidden_states=text_embeddings
217
+ ).sample
218
+
219
+ # perform guidance
220
+ if do_classifier_free_guidance:
221
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
222
+ noise_pred = noise_pred_uncond + guidance_scale * (
223
+ noise_pred_text - noise_pred_uncond
224
+ )
225
+
226
+ # compute the previous noisy sample x_t -> x_t-1
227
+ latents = self.scheduler.step(
228
+ noise_pred, t, latents, **extra_step_kwargs
229
+ ).prev_sample
230
+ # masking
231
+ if add_predicted_noise:
232
+ init_latents_proper = self.scheduler.add_noise(
233
+ init_latents_orig, noise_pred_uncond, torch.tensor([t])
234
+ )
235
+ else:
236
+ init_latents_proper = self.scheduler.add_noise(
237
+ init_latents_orig, noise, torch.tensor([t])
238
+ )
239
+
240
+ latents = (init_latents_proper * mask) + (latents * (1 - mask))
241
+
242
+ # call the callback, if provided
243
+ if i == len(timesteps) - 1 or (
244
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
245
+ ):
246
+ progress_bar.update()
247
+ if callback is not None and i % callback_steps == 0:
248
+ callback(i, t, latents)
249
+
250
+ # use original latents corresponding to unmasked portions of the image
251
+ # necessary step because noise is still added to "init_latents_proper" after final denoising step
252
+ latents = (init_latents_orig * mask) + (latents * (1 - mask))
253
+
254
+ # 10. Post-processing
255
+ if preserve_unmasked_image:
256
+ # decode latents
257
+ latents = 1 / 0.18215 * latents
258
+ inpaint_image = self.vae.decode(latents).sample
259
+
260
+ # restore unmasked parts of image with original image
261
+ input_mask_image = input_mask_image.to(inpaint_image)
262
+ image = image.to(inpaint_image)
263
+ image = (image * input_mask_image) + (
264
+ inpaint_image * (1 - input_mask_image)
265
+ ) # use original unmasked portions of image to avoid degradation
266
+
267
+ # post-processing of image
268
+ image = (image / 2 + 0.5).clamp(0, 1)
269
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
270
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
271
+ else:
272
+ image = self.decode_latents(latents)
273
+
274
+ # 11. Run safety checker
275
+ image, has_nsfw_concept = self.run_safety_checker(
276
+ image, device, text_embeddings.dtype
277
+ )
278
+
279
+ # 12. Convert to PIL
280
+ if output_type == "pil":
281
+ image = self.numpy_to_pil(image)
282
+
283
+ if not return_dict:
284
+ return (image, has_nsfw_concept)
285
+
286
+ return StableDiffusionPipelineOutput(
287
+ images=image, nsfw_content_detected=has_nsfw_concept
288
+ )
utils/shared.py ADDED
@@ -0,0 +1,16 @@
+ import diffusers.schedulers
+
+ # scheduler dict includes superclass SchedulerMixin (it still generates reasonable images)
+ scheduler_dict = {
+ k: v
+ for k, v in diffusers.schedulers.__dict__.items()
+ if "Scheduler" in k and "Flax" not in k
+ }
+ scheduler_dict.pop(
+ "VQDiffusionScheduler"
+ ) # requires unique parameter, unlike other schedulers
+ scheduler_names = list(scheduler_dict.keys())
+ default_scheduler = scheduler_names[3] # expected to be DPM Multistep
+
+ with open("model_ids.txt", "r") as fp:
+ model_ids = fp.read().splitlines()
utils/textual_inversion.py ADDED
@@ -0,0 +1,916 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+
16
+ import argparse
17
+ import logging
18
+ import math
19
+ import os
20
+ import random
21
+ from pathlib import Path
22
+ from typing import Optional
23
+
24
+ import numpy as np
25
+ import torch
26
+ import torch.nn.functional as F
27
+ import torch.utils.checkpoint
28
+ from torch.utils.data import Dataset
29
+
30
+ import datasets
31
+ import diffusers
32
+ import PIL
33
+ import transformers
34
+ from accelerate import Accelerator
35
+ from accelerate.logging import get_logger
36
+ from accelerate.utils import set_seed
37
+ from diffusers import (
38
+ AutoencoderKL,
39
+ DDPMScheduler,
40
+ StableDiffusionPipeline,
41
+ UNet2DConditionModel,
42
+ )
43
+ from diffusers.optimization import get_scheduler
44
+ from diffusers.utils import check_min_version
45
+ from diffusers.utils.import_utils import is_xformers_available
46
+ from huggingface_hub import HfFolder, Repository, create_repo, whoami
47
+
48
+ # TODO: remove and import from diffusers.utils when the new version of diffusers is released
49
+ from packaging import version
50
+ from PIL import Image
51
+ from torchvision import transforms
52
+ from tqdm.auto import tqdm
53
+ from transformers import CLIPTextModel, CLIPTokenizer
54
+
55
+
56
+ if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
57
+ PIL_INTERPOLATION = {
58
+ "linear": PIL.Image.Resampling.BILINEAR,
59
+ "bilinear": PIL.Image.Resampling.BILINEAR,
60
+ "bicubic": PIL.Image.Resampling.BICUBIC,
61
+ "lanczos": PIL.Image.Resampling.LANCZOS,
62
+ "nearest": PIL.Image.Resampling.NEAREST,
63
+ }
64
+ else:
65
+ PIL_INTERPOLATION = {
66
+ "linear": PIL.Image.LINEAR,
67
+ "bilinear": PIL.Image.BILINEAR,
68
+ "bicubic": PIL.Image.BICUBIC,
69
+ "lanczos": PIL.Image.LANCZOS,
70
+ "nearest": PIL.Image.NEAREST,
71
+ }
72
+ # ------------------------------------------------------------------------------
73
+
74
+
75
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
76
+ check_min_version("0.10.0.dev0")
77
+
78
+
79
+ logger = get_logger(__name__)
80
+
81
+
82
+ def save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path):
83
+ logger.info("Saving embeddings")
84
+ learned_embeds = (
85
+ accelerator.unwrap_model(text_encoder)
86
+ .get_input_embeddings()
87
+ .weight[placeholder_token_id]
88
+ )
89
+ learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()}
90
+ torch.save(learned_embeds_dict, save_path)
91
+
92
+
93
+ def parse_args():
94
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
95
+ parser.add_argument(
96
+ "--save_steps",
97
+ type=int,
98
+ default=500,
99
+ help="Save learned_embeds.bin every X updates steps.",
100
+ )
101
+ parser.add_argument(
102
+ "--only_save_embeds",
103
+ action="store_true",
104
+ default=False,
105
+ help="Save only the embeddings for the new concept.",
106
+ )
107
+ parser.add_argument(
108
+ "--pretrained_model_name_or_path",
109
+ type=str,
110
+ default=None,
111
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
112
+ )
113
+ parser.add_argument(
114
+ "--revision",
115
+ type=str,
116
+ default=None,
117
+ help="Revision of pretrained model identifier from huggingface.co/models.",
118
+ )
119
+ parser.add_argument(
120
+ "--tokenizer_name",
121
+ type=str,
122
+ default=None,
123
+ help="Pretrained tokenizer name or path if not the same as model_name",
124
+ )
125
+ parser.add_argument(
126
+ "--train_data_dir",
127
+ type=str,
128
+ default=None,
129
+ help="A folder containing the training data.",
130
+ )
131
+ parser.add_argument(
132
+ "--placeholder_token",
133
+ type=str,
134
+ default=None,
135
+ help="A token to use as a placeholder for the concept.",
136
+ )
137
+ parser.add_argument(
138
+ "--initializer_token",
139
+ type=str,
140
+ default=None,
141
+ help="A token to use as initializer word.",
142
+ )
143
+
144
+ parser.add_argument(
145
+ "--learnable_property",
146
+ type=str,
147
+ default="object",
148
+ help="Choose between 'object' and 'style'",
149
+ )
150
+ parser.add_argument(
151
+ "--repeats",
152
+ type=int,
153
+ default=100,
154
+ help="How many times to repeat the training data.",
155
+ )
156
+ parser.add_argument(
157
+ "--output_dir",
158
+ type=str,
159
+ default="text-inversion-model",
160
+ help="The output directory where the model predictions and checkpoints will be written.",
161
+ )
162
+ parser.add_argument(
163
+ "--seed", type=int, default=None, help="A seed for reproducible training."
164
+ )
165
+ parser.add_argument(
166
+ "--resolution",
167
+ type=int,
168
+ default=512,
169
+ help=(
170
+ "The resolution for input images, all the images in the train/validation dataset will be resized to this"
171
+ " resolution"
172
+ ),
173
+ )
174
+ parser.add_argument(
175
+ "--center_crop",
176
+ action="store_true",
177
+ help="Whether to center crop images before resizing to resolution",
178
+ )
179
+ parser.add_argument(
180
+ "--train_batch_size",
181
+ type=int,
182
+ default=16,
183
+ help="Batch size (per device) for the training dataloader.",
184
+ )
185
+ parser.add_argument("--num_train_epochs", type=int, default=100)
186
+ parser.add_argument(
187
+ "--max_train_steps",
188
+ type=int,
189
+ default=5000,
190
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
191
+ )
192
+ parser.add_argument(
193
+ "--gradient_accumulation_steps",
194
+ type=int,
195
+ default=1,
196
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
197
+ )
198
+ parser.add_argument(
199
+ "--gradient_checkpointing",
200
+ action="store_true",
201
+ help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
202
+ )
203
+ parser.add_argument(
204
+ "--learning_rate",
205
+ type=float,
206
+ default=1e-4,
207
+ help="Initial learning rate (after the potential warmup period) to use.",
208
+ )
209
+ parser.add_argument(
210
+ "--scale_lr",
211
+ action="store_true",
212
+ default=False,
213
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
214
+ )
215
+ parser.add_argument(
216
+ "--lr_scheduler",
217
+ type=str,
218
+ default="constant",
219
+ help=(
220
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
221
+ ' "constant", "constant_with_warmup"]'
222
+ ),
223
+ )
224
+ parser.add_argument(
225
+ "--lr_warmup_steps",
226
+ type=int,
227
+ default=500,
228
+ help="Number of steps for the warmup in the lr scheduler.",
229
+ )
230
+ parser.add_argument(
231
+ "--adam_beta1",
232
+ type=float,
233
+ default=0.9,
234
+ help="The beta1 parameter for the Adam optimizer.",
235
+ )
236
+ parser.add_argument(
237
+ "--adam_beta2",
238
+ type=float,
239
+ default=0.999,
240
+ help="The beta2 parameter for the Adam optimizer.",
241
+ )
242
+ parser.add_argument(
243
+ "--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use."
244
+ )
245
+ parser.add_argument(
246
+ "--adam_epsilon",
247
+ type=float,
248
+ default=1e-08,
249
+ help="Epsilon value for the Adam optimizer",
250
+ )
251
+ parser.add_argument(
252
+ "--push_to_hub",
253
+ action="store_true",
254
+ help="Whether or not to push the model to the Hub.",
255
+ )
256
+ parser.add_argument(
257
+ "--hub_token",
258
+ type=str,
259
+ default=None,
260
+ help="The token to use to push to the Model Hub.",
261
+ )
262
+ parser.add_argument(
263
+ "--hub_model_id",
264
+ type=str,
265
+ default=None,
266
+ help="The name of the repository to keep in sync with the local `output_dir`.",
267
+ )
268
+ parser.add_argument(
269
+ "--logging_dir",
270
+ type=str,
271
+ default="logs",
272
+ help=(
273
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
274
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
275
+ ),
276
+ )
277
+ parser.add_argument(
278
+ "--mixed_precision",
279
+ type=str,
280
+ default="no",
281
+ choices=["no", "fp16", "bf16"],
282
+ help=(
283
+ "Whether to use mixed precision. Choose"
284
+ "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
285
+ "and an Nvidia Ampere GPU."
286
+ ),
287
+ )
288
+ parser.add_argument(
289
+ "--allow_tf32",
290
+ action="store_true",
291
+ help=(
292
+ "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
293
+ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
294
+ ),
295
+ )
296
+ parser.add_argument(
297
+ "--report_to",
298
+ type=str,
299
+ default="tensorboard",
300
+ help=(
301
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
302
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
303
+ ),
304
+ )
305
+ parser.add_argument(
306
+ "--local_rank",
307
+ type=int,
308
+ default=-1,
309
+ help="For distributed training: local_rank",
310
+ )
311
+ parser.add_argument(
312
+ "--checkpointing_steps",
313
+ type=int,
314
+ default=500,
315
+ help=(
316
+ "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
317
+ " training using `--resume_from_checkpoint`."
318
+ ),
319
+ )
320
+ parser.add_argument(
321
+ "--resume_from_checkpoint",
322
+ type=str,
323
+ default=None,
324
+ help=(
325
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
326
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
327
+ ),
328
+ )
329
+ parser.add_argument(
330
+ "--enable_xformers_memory_efficient_attention",
331
+ action="store_true",
332
+ help="Whether or not to use xformers.",
333
+ )
334
+
335
+ args = parser.parse_args()
336
+ env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
337
+ if env_local_rank != -1 and env_local_rank != args.local_rank:
338
+ args.local_rank = env_local_rank
339
+
340
+ # if args.train_data_dir is None:
341
+ # raise ValueError("You must specify a train data directory.")
342
+
343
+ return args
344
+
345
+
346
+ imagenet_templates_small = [
347
+ "a photo of a {}",
348
+ "a rendering of a {}",
349
+ "a cropped photo of the {}",
350
+ "the photo of a {}",
351
+ "a photo of a clean {}",
352
+ "a photo of a dirty {}",
353
+ "a dark photo of the {}",
354
+ "a photo of my {}",
355
+ "a photo of the cool {}",
356
+ "a close-up photo of a {}",
357
+ "a bright photo of the {}",
358
+ "a cropped photo of a {}",
359
+ "a photo of the {}",
360
+ "a good photo of the {}",
361
+ "a photo of one {}",
362
+ "a close-up photo of the {}",
363
+ "a rendition of the {}",
364
+ "a photo of the clean {}",
365
+ "a rendition of a {}",
366
+ "a photo of a nice {}",
367
+ "a good photo of a {}",
368
+ "a photo of the nice {}",
369
+ "a photo of the small {}",
370
+ "a photo of the weird {}",
371
+ "a photo of the large {}",
372
+ "a photo of a cool {}",
373
+ "a photo of a small {}",
374
+ ]
375
+
376
+ imagenet_style_templates_small = [
377
+ "a painting in the style of {}",
378
+ "a rendering in the style of {}",
379
+ "a cropped painting in the style of {}",
380
+ "the painting in the style of {}",
381
+ "a clean painting in the style of {}",
382
+ "a dirty painting in the style of {}",
383
+ "a dark painting in the style of {}",
384
+ "a picture in the style of {}",
385
+ "a cool painting in the style of {}",
386
+ "a close-up painting in the style of {}",
387
+ "a bright painting in the style of {}",
388
+ "a cropped painting in the style of {}",
389
+ "a good painting in the style of {}",
390
+ "a close-up painting in the style of {}",
391
+ "a rendition in the style of {}",
392
+ "a nice painting in the style of {}",
393
+ "a small painting in the style of {}",
394
+ "a weird painting in the style of {}",
395
+ "a large painting in the style of {}",
396
+ ]
397
+
398
+
399
+ class TextualInversionDataset(Dataset):
400
+ def __init__(
401
+ self,
402
+ data_root,
403
+ tokenizer,
404
+ learnable_property="object", # [object, style]
405
+ size=512,
406
+ repeats=100,
407
+ interpolation="bicubic",
408
+ flip_p=0.5,
409
+ set="train",
410
+ placeholder_token="*",
411
+ center_crop=False,
412
+ ):
413
+ self.data_root = data_root
414
+ self.tokenizer = tokenizer
415
+ self.learnable_property = learnable_property
416
+ self.size = size
417
+ self.placeholder_token = placeholder_token
418
+ self.center_crop = center_crop
419
+ self.flip_p = flip_p
420
+
421
+ self.image_paths = [
422
+ os.path.join(self.data_root, file_path)
423
+ for file_path in os.listdir(self.data_root)
424
+ ]
425
+
426
+ self.num_images = len(self.image_paths)
427
+ self._length = self.num_images
428
+
429
+ if set == "train":
430
+ self._length = self.num_images * repeats
431
+
432
+ self.interpolation = {
433
+ "linear": PIL_INTERPOLATION["linear"],
434
+ "bilinear": PIL_INTERPOLATION["bilinear"],
435
+ "bicubic": PIL_INTERPOLATION["bicubic"],
436
+ "lanczos": PIL_INTERPOLATION["lanczos"],
437
+ }[interpolation]
438
+
439
+ self.templates = (
440
+ imagenet_style_templates_small
441
+ if learnable_property == "style"
442
+ else imagenet_templates_small
443
+ )
444
+ self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
445
+
446
+ def __len__(self):
447
+ return self._length
448
+
449
+ def __getitem__(self, i):
450
+ example = {}
451
+ image = Image.open(self.image_paths[i % self.num_images])
452
+
453
+ if image.mode != "RGB":
454
+ image = image.convert("RGB")
455
+
456
+ placeholder_string = self.placeholder_token
457
+ text = random.choice(self.templates).format(placeholder_string)
458
+
459
+ example["input_ids"] = self.tokenizer(
460
+ text,
461
+ padding="max_length",
462
+ truncation=True,
463
+ max_length=self.tokenizer.model_max_length,
464
+ return_tensors="pt",
465
+ ).input_ids[0]
466
+
467
+ # default to score-sde preprocessing
468
+ img = np.array(image).astype(np.uint8)
469
+
470
+ if self.center_crop:
471
+ crop = min(img.shape[0], img.shape[1])
472
+ (h, w,) = (
473
+ img.shape[0],
474
+ img.shape[1],
475
+ )
476
+ img = img[
477
+ (h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2
478
+ ]
479
+
480
+ image = Image.fromarray(img)
481
+ image = image.resize((self.size, self.size), resample=self.interpolation)
482
+
483
+ image = self.flip_transform(image)
484
+ image = np.array(image).astype(np.uint8)
485
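+ # Rescale pixel values from [0, 255] to [-1, 1], the range the VAE expects.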
+ image = (image / 127.5 - 1.0).astype(np.float32)
486
+
487
+ example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
488
+ return example
489
+
490
+
491
+ def get_full_repo_name(
492
+ model_id: str, organization: Optional[str] = None, token: Optional[str] = None
493
+ ):
494
+ if token is None:
495
+ token = HfFolder.get_token()
496
+ if organization is None:
497
+ username = whoami(token)["name"]
498
+ return f"{username}/{model_id}"
499
+ else:
500
+ return f"{organization}/{model_id}"
501
+
502
+
503
+ def main(pipe, args_imported):
504
+
505
+ args = parse_args()
506
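+ # Overlay the caller-supplied arguments on top of the argparse defaults.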
+ vars(args).update(vars(args_imported))
507
+
508
+ print(args)
509
+ logging_dir = os.path.join(args.output_dir, args.logging_dir)
510
+
511
+ accelerator = Accelerator(
512
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
513
+ mixed_precision=args.mixed_precision,
514
+ log_with=args.report_to,
515
+ logging_dir=logging_dir,
516
+ )
517
+
518
+ # Make one log on every process with the configuration for debugging.
519
+ logging.basicConfig(
520
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
521
+ datefmt="%m/%d/%Y %H:%M:%S",
522
+ level=logging.INFO,
523
+ )
524
+ logger.info(accelerator.state, main_process_only=False)
525
+ if accelerator.is_local_main_process:
526
+ datasets.utils.logging.set_verbosity_warning()
527
+ transformers.utils.logging.set_verbosity_warning()
528
+ diffusers.utils.logging.set_verbosity_info()
529
+ else:
530
+ datasets.utils.logging.set_verbosity_error()
531
+ transformers.utils.logging.set_verbosity_error()
532
+ diffusers.utils.logging.set_verbosity_error()
533
+
534
+ # If passed along, set the training seed now.
535
+ if args.seed is not None:
536
+ set_seed(args.seed)
537
+
538
+ # Handle the repository creation
539
+ if accelerator.is_main_process:
540
+ if args.push_to_hub:
541
+ if args.hub_model_id is None:
542
+ repo_name = get_full_repo_name(
543
+ Path(args.output_dir).name, token=args.hub_token
544
+ )
545
+ else:
546
+ repo_name = args.hub_model_id
547
+ create_repo(repo_name, exist_ok=True, token=args.hub_token)
548
+ repo = Repository(
549
+ args.output_dir, clone_from=repo_name, token=args.hub_token
550
+ )
551
+
552
+ with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
553
+ if "step_*" not in gitignore:
554
+ gitignore.write("step_*\n")
555
+ if "epoch_*" not in gitignore:
556
+ gitignore.write("epoch_*\n")
557
+ elif args.output_dir is not None:
558
+ os.makedirs(args.output_dir, exist_ok=True)
559
+
560
+ # Load tokenizer
561
+ tokenizer = pipe.tokenizer
562
+
563
+ # Load scheduler and models
564
+ noise_scheduler = pipe.scheduler
565
+ text_encoder = pipe.text_encoder
566
+ vae = pipe.vae
567
+ unet = pipe.unet
568
+
569
+ # Add the placeholder token to the tokenizer
570
+ num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
571
+ if num_added_tokens == 0:
572
+ raise ValueError(
573
+ f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
574
+ " `placeholder_token` that is not already in the tokenizer."
575
+ )
576
+
577
+ # Convert the initializer_token, placeholder_token to ids
578
+ token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
579
+ # Check if initializer_token is a single token or a sequence of tokens
580
+ if len(token_ids) > 1:
581
+ raise ValueError("The initializer token must be a single token.")
582
+
583
+ initializer_token_id = token_ids[0]
584
+ placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
585
+
586
+ # Resize the token embeddings as we are adding new special tokens to the tokenizer
587
+ text_encoder.resize_token_embeddings(len(tokenizer))
588
+
589
+ # Initialise the newly added placeholder token with the embeddings of the initializer token
590
+ token_embeds = text_encoder.get_input_embeddings().weight.data
591
+ token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
592
+
593
+ # Freeze vae and unet
594
+ vae.requires_grad_(False)
595
+ unet.requires_grad_(False)
596
+ # Freeze all parameters except for the token embeddings in text encoder
597
+ text_encoder.text_model.encoder.requires_grad_(False)
598
+ text_encoder.text_model.final_layer_norm.requires_grad_(False)
599
+ text_encoder.text_model.embeddings.position_embedding.requires_grad_(False)
600
+
601
+ if args.gradient_checkpointing:
602
+ # Keep unet in train mode if we are using gradient checkpointing to save memory.
603
+ # Dropout is 0 in the unet, so being in train rather than eval mode does not change its outputs.
604
+ unet.train()
605
+ text_encoder.gradient_checkpointing_enable()
606
+ unet.enable_gradient_checkpointing()
607
+
608
+ if args.enable_xformers_memory_efficient_attention:
609
+ if is_xformers_available():
610
+ unet.enable_xformers_memory_efficient_attention()
611
+ else:
612
+ raise ValueError(
613
+ "xformers is not available. Make sure it is installed correctly"
614
+ )
615
+
616
+ # Enable TF32 for faster training on Ampere GPUs,
617
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
618
+ if args.allow_tf32:
619
+ torch.backends.cuda.matmul.allow_tf32 = True
620
+
621
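+ # Optionally scale the base learning rate by the effective batch size.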
+ if args.scale_lr:
622
+ args.learning_rate = (
623
+ args.learning_rate
624
+ * args.gradient_accumulation_steps
625
+ * args.train_batch_size
626
+ * accelerator.num_processes
627
+ )
628
+
629
+ # Initialize the optimizer
630
+ optimizer = torch.optim.AdamW(
631
+ text_encoder.get_input_embeddings().parameters(), # only optimize the embeddings
632
+ lr=args.learning_rate,
633
+ betas=(args.adam_beta1, args.adam_beta2),
634
+ weight_decay=args.adam_weight_decay,
635
+ eps=args.adam_epsilon,
636
+ )
637
+
638
+ # Dataset and DataLoaders creation:
639
+ train_dataset = TextualInversionDataset(
640
+ data_root=args.train_data_dir,
641
+ tokenizer=tokenizer,
642
+ size=args.resolution,
643
+ placeholder_token=args.placeholder_token,
644
+ repeats=args.repeats,
645
+ learnable_property=args.learnable_property,
646
+ center_crop=args.center_crop,
647
+ set="train",
648
+ )
649
+ train_dataloader = torch.utils.data.DataLoader(
650
+ train_dataset, batch_size=args.train_batch_size, shuffle=True
651
+ )
652
+
653
+ # Scheduler and math around the number of training steps.
654
+ overrode_max_train_steps = False
655
+ num_update_steps_per_epoch = math.ceil(
656
+ len(train_dataloader) / args.gradient_accumulation_steps
657
+ )
658
+ if args.max_train_steps is None:
659
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
660
+ overrode_max_train_steps = True
661
+
662
+ lr_scheduler = get_scheduler(
663
+ args.lr_scheduler,
664
+ optimizer=optimizer,
665
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
666
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
667
+ )
668
+
669
+ # Prepare everything with our `accelerator`.
670
+ text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
671
+ text_encoder, optimizer, train_dataloader, lr_scheduler
672
+ )
673
+
674
+ # For mixed precision training we cast the unet and vae weights to half-precision,
675
+ # as these models are only used for inference; keeping them in full precision is not required.
676
+ weight_dtype = torch.float32
677
+ if accelerator.mixed_precision == "fp16":
678
+ weight_dtype = torch.float16
679
+ elif accelerator.mixed_precision == "bf16":
680
+ weight_dtype = torch.bfloat16
681
+
682
+ # Move vae and unet to device and cast to weight_dtype
683
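+ # The text encoder stays in float32 since its token embeddings are the weights being trained.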
+ unet.to(accelerator.device, dtype=weight_dtype)
684
+ vae.to(accelerator.device, dtype=weight_dtype)
685
+ text_encoder.to(accelerator.device, dtype=torch.float32)
686
+
687
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
688
+ num_update_steps_per_epoch = math.ceil(
689
+ len(train_dataloader) / args.gradient_accumulation_steps
690
+ )
691
+ if overrode_max_train_steps:
692
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
693
+ # Afterwards we recalculate our number of training epochs
694
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
695
+
696
+ # We need to initialize the trackers we use, and also store our configuration.
697
+ # The trackers initialize automatically on the main process.
698
+ if accelerator.is_main_process:
699
+ accelerator.init_trackers("textual_inversion", config=vars(args))
700
+
701
+ # Train!
702
+ total_batch_size = (
703
+ args.train_batch_size
704
+ * accelerator.num_processes
705
+ * args.gradient_accumulation_steps
706
+ )
707
+
708
+ logger.info("***** Running training *****")
709
+ logger.info(f" Num examples = {len(train_dataset)}")
710
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
711
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
712
+ logger.info(
713
+ f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
714
+ )
715
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
716
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
717
+ global_step = 0
718
+ first_epoch = 0
719
+
720
+ # Potentially load in the weights and states from a previous save
721
+ if args.resume_from_checkpoint:
722
+ if args.resume_from_checkpoint != "latest":
723
+ path = os.path.basename(args.resume_from_checkpoint)
724
+ else:
725
+ # Get the most recent checkpoint
726
+ dirs = os.listdir(args.output_dir)
727
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
728
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
729
+ path = dirs[-1]
730
+ accelerator.print(f"Resuming from checkpoint {path}")
731
+ accelerator.load_state(os.path.join(args.output_dir, path))
732
+ global_step = int(path.split("-")[1])
733
+
734
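+ # Translate the saved optimizer-step count back into an epoch and in-epoch step to resume from.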
+ resume_global_step = global_step * args.gradient_accumulation_steps
735
+ first_epoch = resume_global_step // num_update_steps_per_epoch
736
+ resume_step = resume_global_step % num_update_steps_per_epoch
737
+
738
+ # Only show the progress bar once on each machine.
739
+ progress_bar = tqdm(
740
+ range(global_step, args.max_train_steps),
741
+ disable=not accelerator.is_local_main_process,
742
+ )
743
+ progress_bar.set_description("Steps")
744
+
745
+ # keep original embeddings as reference
746
+ orig_embeds_params = (
747
+ accelerator.unwrap_model(text_encoder)
748
+ .get_input_embeddings()
749
+ .weight.data.clone()
750
+ )
751
+
752
+ for epoch in range(first_epoch, args.num_train_epochs):
753
+ text_encoder.train()
754
+ for step, batch in enumerate(train_dataloader):
755
+ # Skip steps until we reach the resumed step
756
+ if (
757
+ args.resume_from_checkpoint
758
+ and epoch == first_epoch
759
+ and step < resume_step
760
+ ):
761
+ if step % args.gradient_accumulation_steps == 0:
762
+ progress_bar.update(1)
763
+ continue
764
+
765
+ with accelerator.accumulate(text_encoder):
766
+ # Convert images to latent space
767
+ latents = (
768
+ vae.encode(batch["pixel_values"].to(dtype=weight_dtype))
769
+ .latent_dist.sample()
770
+ .detach()
771
+ )
772
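+ # 0.18215 is the Stable Diffusion VAE latent scaling factor.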
+ latents = latents * 0.18215
773
+
774
+ # Sample noise that we'll add to the latents
775
+ noise = torch.randn_like(latents)
776
+ bsz = latents.shape[0]
777
+ # Sample a random timestep for each image
778
+ timesteps = torch.randint(
779
+ 0,
780
+ noise_scheduler.config.num_train_timesteps,
781
+ (bsz,),
782
+ device=latents.device,
783
+ )
784
+ timesteps = timesteps.long()
785
+
786
+ # Add noise to the latents according to the noise magnitude at each timestep
787
+ # (this is the forward diffusion process)
788
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
789
+
790
+ # Get the text embedding for conditioning
791
+ encoder_hidden_states = text_encoder(batch["input_ids"])[0].to(
792
+ dtype=weight_dtype
793
+ )
794
+
795
+ # Predict the noise residual
796
+ model_pred = unet(
797
+ noisy_latents, timesteps, encoder_hidden_states
798
+ ).sample
799
+
800
+ # Get the target for loss depending on the prediction type
801
+ if noise_scheduler.config.prediction_type == "epsilon":
802
+ target = noise
803
+ elif noise_scheduler.config.prediction_type == "v_prediction":
804
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
805
+ else:
806
+ raise ValueError(
807
+ f"Unknown prediction type {noise_scheduler.config.prediction_type}"
808
+ )
809
+
810
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
811
+
812
+ accelerator.backward(loss)
813
+
814
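+ # In multi-process training the text encoder is wrapped, so its embeddings live under .module;
+ # grab the embedding gradients so every row except the placeholder token can be zeroed below.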
+ if accelerator.num_processes > 1:
815
+ grads = text_encoder.module.get_input_embeddings().weight.grad
816
+ else:
817
+ grads = text_encoder.get_input_embeddings().weight.grad
818
+ # Get the index for tokens that we want to zero the grads for
819
+ index_grads_to_zero = (
820
+ torch.arange(len(tokenizer)) != placeholder_token_id
821
+ )
822
+ grads.data[index_grads_to_zero, :] = grads.data[
823
+ index_grads_to_zero, :
824
+ ].fill_(0)
825
+
826
+ optimizer.step()
827
+ lr_scheduler.step()
828
+ optimizer.zero_grad()
829
+
830
+ # Let's make sure we don't update any embedding weights besides the newly added token
831
+ index_no_updates = torch.arange(len(tokenizer)) != placeholder_token_id
832
+ with torch.no_grad():
833
+ accelerator.unwrap_model(
834
+ text_encoder
835
+ ).get_input_embeddings().weight[
836
+ index_no_updates
837
+ ] = orig_embeds_params[
838
+ index_no_updates
839
+ ]
840
+
841
+ # Checks if the accelerator has performed an optimization step behind the scenes
842
+ if accelerator.sync_gradients:
843
+ progress_bar.update(1)
844
+ global_step += 1
845
+ if global_step % args.save_steps == 0:
846
+ save_path = os.path.join(
847
+ args.output_dir, f"{args.placeholder_token}-{global_step}.bin"
848
+ )
849
+ save_progress(
850
+ text_encoder, placeholder_token_id, accelerator, args, save_path
851
+ )
852
+
853
+ if global_step % args.checkpointing_steps == 0:
854
+ if accelerator.is_main_process:
855
+ save_path = os.path.join(
856
+ args.output_dir, f"checkpoint-{global_step}"
857
+ )
858
+ accelerator.save_state(save_path)
859
+ logger.info(f"Saved state to {save_path}")
860
+
861
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
862
+ progress_bar.set_postfix(**logs)
863
+ accelerator.log(logs, step=global_step)
864
+
865
+ if global_step >= args.max_train_steps:
866
+ break
867
+
868
+ # Create the pipeline using the trained modules and save it.
869
+ accelerator.wait_for_everyone()
870
+ if accelerator.is_main_process:
871
+ if args.push_to_hub and args.only_save_embeds:
872
+ logger.warn(
873
+ "Enabling full model saving because --push_to_hub=True was specified."
874
+ )
875
+ save_full_model = True
876
+ else:
877
+ save_full_model = not args.only_save_embeds
878
+ if save_full_model:
879
+ pipe.save_pretrained(args.output_dir)
880
+ # Save the newly trained embeddings
881
+ save_path = os.path.join(args.output_dir, "learned_embeds.bin")
882
+ save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)
883
+
884
+ if args.push_to_hub:
885
+ repo.push_to_hub(
886
+ commit_message="End of training", blocking=False, auto_lfs_prune=True
887
+ )
888
+
889
+ accelerator.end_training()
890
+
891
+
892
+ if __name__ == "__main__":
893
+ pipeline = StableDiffusionPipeline.from_pretrained(
894
+ "andite/anything-v4.0", torch_dtype=torch.float16
895
+ )
896
+
897
+ imported_args = argparse.Namespace(
898
+ train_data_dir="concept_images",
899
+ learnable_property="object",
900
+ placeholder_token="redeyegirl",
901
+ initializer_token="girl",
902
+ resolution=512,
903
+ train_batch_size=1,
904
+ gradient_accumulation_steps=1,
905
+ gradient_checkpointing=True,
906
+ mixed_precision="fp16",
907
+ use_bf16=False,
908
+ max_train_steps=1000,
909
+ learning_rate=5.0e-4,
910
+ scale_lr=False,
911
+ lr_scheduler="constant",
912
+ lr_warmup_steps=0,
913
+ output_dir="output_model",
914
+ )
915
+
916
+ main(pipeline, imported_args)
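
Once training finishes, the embedding written to `output_model/learned_embeds.bin` can be loaded back into a pipeline for inference. The sketch below is illustrative only, not part of this commit: it assumes the file follows the usual textual-inversion format of `{placeholder_token: embedding_tensor}` (as `save_progress` writes it above) and that a CUDA device is available.

```python
import torch
from diffusers import StableDiffusionPipeline

# Load the same base model the embedding was trained against.
pipe = StableDiffusionPipeline.from_pretrained(
    "andite/anything-v4.0", torch_dtype=torch.float16
).to("cuda")

# Assumed format: {placeholder_token: embedding_tensor}, e.g. {"redeyegirl": tensor([...])}.
learned = torch.load("output_model/learned_embeds.bin", map_location="cpu")
token, embedding = next(iter(learned.items()))

# Register the placeholder token and copy its trained embedding into the text encoder.
pipe.tokenizer.add_tokens(token)
token_id = pipe.tokenizer.convert_tokens_to_ids(token)
pipe.text_encoder.resize_token_embeddings(len(pipe.tokenizer))
pipe.text_encoder.get_input_embeddings().weight.data[token_id] = embedding.to(
    device=pipe.device, dtype=pipe.text_encoder.dtype
)

image = pipe(f"a photo of {token}").images[0]
image.save("textual_inversion_sample.png")
```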