ManglerFTW committed
Commit 3a18eba
Parent(s): ba2acac
Upload 12 files

- StableTuner_RunPod_Fix/captionBuddy.py +967 -0
- StableTuner_RunPod_Fix/clip_segmentation.py +325 -0
- StableTuner_RunPod_Fix/configuration_gui.py +0 -0
- StableTuner_RunPod_Fix/convert_diffusers_to_sd_cli.py +22 -0
- StableTuner_RunPod_Fix/converters.py +120 -0
- StableTuner_RunPod_Fix/dataloaders_util.py +1331 -0
- StableTuner_RunPod_Fix/discriminator.py +764 -0
- StableTuner_RunPod_Fix/lion_pytorch.py +88 -0
- StableTuner_RunPod_Fix/lora_utils.py +236 -0
- StableTuner_RunPod_Fix/model_util.py +1543 -0
- StableTuner_RunPod_Fix/trainer.py +1750 -0
- StableTuner_RunPod_Fix/trainer_util.py +435 -0
StableTuner_RunPod_Fix/captionBuddy.py
ADDED
@@ -0,0 +1,967 @@
import tkinter as tk
from tkinter import ttk, Menu
import tkinter.messagebox
import tkinter.filedialog as fd
import os
import subprocess
import sys
import json
import random
from PIL import Image, ImageTk, ImageDraw
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import torch
import numpy as np
import requests
import customtkinter as ctk
from customtkinter import ThemeManager

from clip_segmentation import ClipSeg

#main class
ctk.set_appearance_mode("dark")
ctk.set_default_color_theme("blue")

class BatchMaskWindow(ctk.CTkToplevel):
    def __init__(self, parent, path, *args, **kwargs):
        ctk.CTkToplevel.__init__(self, parent, *args, **kwargs)
        self.parent = parent

        self.title("Batch process masks")
        self.geometry("320x310")
        self.resizable(False, False)
        self.wait_visibility()
        self.grab_set()
        self.focus_set()

        self.mode_var = tk.StringVar(self, "Create if absent")
        self.modes = ["Replace all masks", "Create if absent", "Add to existing", "Subtract from existing"]

        self.frame = ctk.CTkFrame(self, width=600, height=300)
        self.frame.grid(row=0, column=0, sticky="nsew", padx=10, pady=10)

        self.path_label = ctk.CTkLabel(self.frame, text="Folder", width=100)
        self.path_label.grid(row=0, column=0, sticky="w", padx=5, pady=5)
        self.path_entry = ctk.CTkEntry(self.frame, width=150)
        self.path_entry.insert(0, path)
        self.path_entry.grid(row=0, column=1, sticky="w", padx=5, pady=5)
        self.path_button = ctk.CTkButton(self.frame, width=30, text="...", command=lambda: self.browse_for_path(self.path_entry))
        self.path_button.grid(row=0, column=1, sticky="e", padx=5, pady=5)

        self.prompt_label = ctk.CTkLabel(self.frame, text="Prompt", width=100)
        self.prompt_label.grid(row=1, column=0, sticky="w", padx=5, pady=5)
        self.prompt_entry = ctk.CTkEntry(self.frame, width=200)
        self.prompt_entry.grid(row=1, column=1, sticky="w", padx=5, pady=5)

        self.mode_label = ctk.CTkLabel(self.frame, text="Mode", width=100)
        self.mode_label.grid(row=2, column=0, sticky="w", padx=5, pady=5)
        self.mode_dropdown = ctk.CTkOptionMenu(self.frame, variable=self.mode_var, values=self.modes, dynamic_resizing=False, width=200)
        self.mode_dropdown.grid(row=2, column=1, sticky="w", padx=5, pady=5)

        self.threshold_label = ctk.CTkLabel(self.frame, text="Threshold", width=100)
        self.threshold_label.grid(row=3, column=0, sticky="w", padx=5, pady=5)
        self.threshold_entry = ctk.CTkEntry(self.frame, width=200, placeholder_text="0.0 - 1.0")
        self.threshold_entry.insert(0, "0.3")
        self.threshold_entry.grid(row=3, column=1, sticky="w", padx=5, pady=5)

        self.smooth_label = ctk.CTkLabel(self.frame, text="Smooth", width=100)
        self.smooth_label.grid(row=4, column=0, sticky="w", padx=5, pady=5)
        self.smooth_entry = ctk.CTkEntry(self.frame, width=200, placeholder_text="5")
        self.smooth_entry.insert(0, 5)
        self.smooth_entry.grid(row=4, column=1, sticky="w", padx=5, pady=5)

        self.expand_label = ctk.CTkLabel(self.frame, text="Expand", width=100)
        self.expand_label.grid(row=5, column=0, sticky="w", padx=5, pady=5)
        self.expand_entry = ctk.CTkEntry(self.frame, width=200, placeholder_text="10")
        self.expand_entry.insert(0, 10)
        self.expand_entry.grid(row=5, column=1, sticky="w", padx=5, pady=5)

        self.progress_label = ctk.CTkLabel(self.frame, text="Progress: 0/0", width=100)
        self.progress_label.grid(row=6, column=0, sticky="w", padx=5, pady=5)
        self.progress = ctk.CTkProgressBar(self.frame, orientation="horizontal", mode="determinate", width=200)
        self.progress.grid(row=6, column=1, sticky="w", padx=5, pady=5)

        self.create_masks_button = ctk.CTkButton(self.frame, text="Create Masks", width=310, command=self.create_masks)
        self.create_masks_button.grid(row=7, column=0, columnspan=2, sticky="w", padx=5, pady=5)

        self.frame.pack(fill="both", expand=True)

    def browse_for_path(self, entry_box):
        # get the path from the user
        path = fd.askdirectory()
        # set the path to the entry box
        # delete entry box text
        entry_box.focus_set()
        entry_box.delete(0, tk.END)
        entry_box.insert(0, path)
        self.focus_set()

    def set_progress(self, value, max_value):
        progress = value / max_value
        self.progress.set(progress)
        self.progress_label.configure(text="{0}/{1}".format(value, max_value))
        self.progress.update()

    def create_masks(self):
        self.parent.load_clip_seg_model()

        mode = {
            "Replace all masks": "replace",
            "Create if absent": "fill",
            "Add to existing": "add",
            "Subtract from existing": "subtract"
        }[self.mode_var.get()]

        self.parent.clip_seg.mask_folder(
            sample_dir=self.path_entry.get(),
            prompts=[self.prompt_entry.get()],
            mode=mode,
            threshold=float(self.threshold_entry.get()),
            smooth_pixels=int(self.smooth_entry.get()),
            expand_pixels=int(self.expand_entry.get()),
            progress_callback=self.set_progress,
        )
        self.parent.load_image()


def _check_file_type(f: str) -> bool:
    return f.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', ".bmp", ".tiff"))


class ImageBrowser(ctk.CTkToplevel):
    def __init__(self, mainProcess=None):
        super().__init__()
        if not os.path.exists("scripts/BLIP"):
            print("Getting BLIP from GitHub.")
            subprocess.run(["git", "clone", "https://github.com/salesforce/BLIP", "scripts/BLIP"])
        #if not os.path.exists("scripts/CLIP"):
        #    print("Getting CLIP from GitHub.")
        #    subprocess.run(["git", "clone", "https://github.com/pharmapsychotic/clip-interrogator.git", "scripts/CLIP"])
        blip_path = "scripts/BLIP"
        sys.path.append(blip_path)
        #clip_path = "scripts/CLIP"
        #sys.path.append(clip_path)
        self.mainProcess = mainProcess
        self.captioner_folder = os.path.dirname(os.path.realpath(__file__))
        self.clip_seg = None
        self.PILimage = None
        self.PILmask = None
        self.mask_draw_x = 0
        self.mask_draw_y = 0
        self.mask_draw_radius = 20
        #self = master
        #self.overrideredirect(True)
        #self.title_bar = TitleBar(self)
        #self.title_bar.pack(side="top", fill="x")
        #make not user resizable
        self.title("Caption Buddy")
        #self.resizable(False, False)
        self.geometry("720x820")
        self.top_frame = ctk.CTkFrame(self, fg_color='transparent')
        self.top_frame.pack(side="top", fill="x", expand=False)
        self.top_subframe = ctk.CTkFrame(self.top_frame, fg_color='transparent')
        self.top_subframe.pack(side="bottom", fill="x", pady=10)
        self.top_subframe.grid_columnconfigure(0, weight=1)
        self.top_subframe.grid_columnconfigure(1, weight=1)
        self.tip_frame = ctk.CTkFrame(self, fg_color='transparent')
        self.tip_frame.pack(side="top")
        self.dark_mode_var = "#202020"
        #self.dark_purple_mode_var = "#1B0F1B"
        self.dark_mode_title_var = "#286aff"
        self.dark_mode_button_pressed_var = "#BB91B6"
        self.dark_mode_button_var = "#8ea0e1"
        self.dark_mode_text_var = "#c6c7c8"
        #self.configure(bg_color=self.dark_mode_var)
        self.canvas = ctk.CTkLabel(self, text='', width=600, height=600)
        #self.canvas.configure(bg_color=self.dark_mode_var)
        #create temporary image for canvas
        self.canvas.pack()
        self.cur_img_index = 0
        self.image_count = 0
        #make a frame with a grid under the canvas
        self.frame = ctk.CTkFrame(self)
        #grid
        self.frame.grid_columnconfigure(0, weight=1)
        self.frame.grid_columnconfigure(1, weight=100)
        self.frame.grid_columnconfigure(2, weight=1)
        self.frame.grid_rowconfigure(0, weight=1)

        #show the frame
        self.frame.pack(side="bottom", fill="x")
        #bottom frame
        self.bottom_frame = ctk.CTkFrame(self)
        #make grid
        self.bottom_frame.grid_columnconfigure(0, weight=0)
        self.bottom_frame.grid_columnconfigure(1, weight=2)
        self.bottom_frame.grid_columnconfigure(2, weight=0)
        self.bottom_frame.grid_columnconfigure(3, weight=2)
        self.bottom_frame.grid_columnconfigure(4, weight=0)
        self.bottom_frame.grid_columnconfigure(5, weight=2)
        self.bottom_frame.grid_rowconfigure(0, weight=1)
        #show the frame
        self.bottom_frame.pack(side="bottom", fill="x")

        self.image_index = 0
        self.image_list = []
        self.caption = ''
        self.caption_file = ''
        self.caption_file_path = ''
        self.caption_file_name = ''
        self.caption_file_ext = ''
        self.caption_file_name_no_ext = ''
        self.output_format = 'text'
        #check if bad_files.txt exists
        if os.path.exists("bad_files.txt"):
            #delete it
            os.remove("bad_files.txt")
        self.use_blip = True
        self.debug = False
        self.create_widgets()
        self.load_blip_model()
        self.load_options()
        #self.open_folder()

        self.canvas.focus_force()
        self.canvas.bind("<Alt-Right>", self.next_image)
        self.canvas.bind("<Alt-Left>", self.prev_image)
        #on close window
        self.protocol("WM_DELETE_WINDOW", self.on_closing)

    def on_closing(self):
        #self.save_options()
        self.mainProcess.deiconify()
        self.destroy()

    def create_widgets(self):
        self.output_folder = ''

        # add a checkbox to toggle auto generate caption
        self.auto_generate_caption = tk.BooleanVar(self.top_subframe)
        self.auto_generate_caption.set(True)
        self.auto_generate_caption_checkbox = ctk.CTkCheckBox(self.top_subframe, text="Auto Generate Caption", variable=self.auto_generate_caption, width=50)
        self.auto_generate_caption_checkbox.pack(side="left", fill="x", expand=True, padx=10)

        # add a checkbox to skip auto generating captions if they already exist
        self.auto_generate_caption_text_override = tk.BooleanVar(self.top_subframe)
        self.auto_generate_caption_text_override.set(False)
        self.auto_generate_caption_checkbox_text_override = ctk.CTkCheckBox(self.top_subframe, text="Skip Auto Generate If Text Caption Exists", variable=self.auto_generate_caption_text_override, width=50)
        self.auto_generate_caption_checkbox_text_override.pack(side="left", fill="x", expand=True, padx=10)

        # add a checkbox to enable mask editing
        self.enable_mask_editing = tk.BooleanVar(self.top_subframe)
        self.enable_mask_editing.set(False)
        self.enable_mask_editing_checkbox = ctk.CTkCheckBox(self.top_subframe, text="Enable Mask Editing", variable=self.enable_mask_editing, width=50)
        self.enable_mask_editing_checkbox.pack(side="left", fill="x", expand=True, padx=10)

        self.open_button = ctk.CTkButton(self.top_frame, text="Load Folder", fg_color=("gray75", "gray25"), command=self.open_folder, width=50)
        #self.open_button.grid(row=0, column=1)
        self.open_button.pack(side="left", fill="x", expand=True, padx=10)
        #add a batch folder button
        self.batch_folder_caption_button = ctk.CTkButton(self.top_frame, text="Batch Folder Caption", fg_color=("gray75", "gray25"), command=self.batch_folder_caption, width=50)
        self.batch_folder_caption_button.pack(side="left", fill="x", expand=True, padx=10)
        self.batch_folder_mask_button = ctk.CTkButton(self.top_frame, text="Batch Folder Mask", fg_color=("gray75", "gray25"), command=self.batch_folder_mask, width=50)
        self.batch_folder_mask_button.pack(side="left", fill="x", expand=True, padx=10)

        #add an options button to the same row as the open button
        self.options_button = ctk.CTkButton(self.top_frame, text="Options", fg_color=("gray75", "gray25"), command=self.open_options, width=50)
        self.options_button.pack(side="left", fill="x", expand=True, padx=10)
        #add generate caption button
        self.generate_caption_button = ctk.CTkButton(self.top_frame, text="Generate Caption", fg_color=("gray75", "gray25"), command=self.generate_caption, width=50)
        self.generate_caption_button.pack(side="left", fill="x", expand=True, padx=10)

        #add a label for tips under the buttons
        self.tips_label = ctk.CTkLabel(self.tip_frame, text="Use Alt with left and right arrow keys to navigate images, enter to save the caption.")
        self.tips_label.pack(side="top")
        #add image count label
        self.image_count_label = ctk.CTkLabel(self.tip_frame, text=f"Image {self.cur_img_index} of {self.image_count}")
        self.image_count_label.pack(side="top")

        self.image_label = ctk.CTkLabel(self.canvas, text='', width=100, height=100)
        self.image_label.grid(row=0, column=0, sticky="nsew")
        #self.image_label.bind("<Button-3>", self.click_canvas)
        self.image_label.bind("<Motion>", self.draw_mask)
        self.image_label.bind("<Button-1>", self.draw_mask)
        self.image_label.bind("<Button-3>", self.draw_mask)
        self.image_label.bind("<MouseWheel>", self.draw_mask_radius)
        #self.image_label.pack(side="top")
        #previous button
        self.prev_button = ctk.CTkButton(self.frame, text="Previous", command=lambda event=None: self.prev_image(event), width=50)
        #grid
        self.prev_button.grid(row=1, column=0, sticky="w", padx=5, pady=10)
        #self.prev_button.pack(side="left")
        #self.prev_button.bind("<Left>", self.prev_image)
        self.caption_entry = ctk.CTkEntry(self.frame)
        #grid
        self.caption_entry.grid(row=1, column=1, rowspan=3, sticky="nsew", pady=10)
        #bind to enter key
        self.caption_entry.bind("<Return>", self.save)
        self.canvas.bind("<Return>", self.save)
        self.caption_entry.bind("<Alt-Right>", self.next_image)
        self.caption_entry.bind("<Alt-Left>", self.prev_image)
        self.caption_entry.bind("<Control-BackSpace>", self.delete_word)
        #next button

        self.next_button = ctk.CTkButton(self.frame, text='Next', command=lambda event=None: self.next_image(event), width=50)
        #self.next_button["text"] = "Next"
        #grid
        self.next_button.grid(row=1, column=2, sticky="e", padx=5, pady=10)
        #add two entry boxes and labels in the style of: replace _ with _
        #create replace string variable
        self.replace_label = ctk.CTkLabel(self.bottom_frame, text="Replace:")
        self.replace_label.grid(row=0, column=0, sticky="w", padx=5)
        self.replace_entry = ctk.CTkEntry(self.bottom_frame)
        self.replace_entry.grid(row=0, column=1, sticky="nsew", padx=5)
        self.replace_entry.bind("<Return>", self.save)
        #self.replace_entry.bind("<Tab>", self.replace)
        #with label
        #create with string variable
        self.with_label = ctk.CTkLabel(self.bottom_frame, text="With:")
        self.with_label.grid(row=0, column=2, sticky="w", padx=5)
        self.with_entry = ctk.CTkEntry(self.bottom_frame)
        self.with_entry.grid(row=0, column=3, sticky="nswe", padx=5)
        self.with_entry.bind("<Return>", self.save)
        #add another entry with label, add suffix

        #create prefix string var
        self.prefix_label = ctk.CTkLabel(self.bottom_frame, text="Add to start:")
        self.prefix_label.grid(row=0, column=4, sticky="w", padx=5)
        self.prefix_entry = ctk.CTkEntry(self.bottom_frame)
        self.prefix_entry.grid(row=0, column=5, sticky="nsew", padx=5)
        self.prefix_entry.bind("<Return>", self.save)

        #create suffix string var
        self.suffix_label = ctk.CTkLabel(self.bottom_frame, text="Add to end:")
        self.suffix_label.grid(row=0, column=6, sticky="w", padx=5)
        self.suffix_entry = ctk.CTkEntry(self.bottom_frame)
        self.suffix_entry.grid(row=0, column=7, sticky="nsew", padx=5)
        self.suffix_entry.bind("<Return>", self.save)
        self.all_entries = [self.replace_entry, self.with_entry, self.suffix_entry, self.caption_entry, self.prefix_entry]
        #bind right click menu to all entries
        for entry in self.all_entries:
            entry.bind("<Button-3>", self.create_right_click_menu)

    def batch_folder_caption(self):
        #show imgs in folder askdirectory
        #ask user if to batch current folder or select folder
        #if bad_files.txt exists, delete it
        self.bad_files = []
        if os.path.exists('bad_files.txt'):
            os.remove('bad_files.txt')
        try:
            #check if self.folder is set
            self.folder
        except AttributeError:
            self.folder = ''
        if self.folder == '':
            self.folder = fd.askdirectory(title="Select Folder to Batch Process", initialdir=os.getcwd())
            batch_input_dir = self.folder
        else:
            ask = tk.messagebox.askquestion("Batch Folder", "Batch current folder?")
            if ask == 'yes':
                batch_input_dir = self.folder
            else:
                batch_input_dir = fd.askdirectory(title="Select Folder to Batch Process", initialdir=os.getcwd())
        ask2 = tk.messagebox.askquestion("Batch Folder", "Save output to same directory?")
        if ask2 == 'yes':
            batch_output_dir = batch_input_dir
        else:
            batch_output_dir = fd.askdirectory(title="Select Folder to Save Batch Processed Images", initialdir=os.getcwd())
        if batch_input_dir == '':
            return
        if batch_output_dir == '':
            batch_output_dir = batch_input_dir

        self.caption_file_name = os.path.basename(batch_input_dir)
        self.image_list = []
        for file in os.listdir(batch_input_dir):
            if _check_file_type(file) and not file.endswith('-masklabel.png'):
                self.image_list.append(os.path.join(batch_input_dir, file))
        self.image_index = 0
        #use progress bar class
        #pba = tk.Tk()
        #pba.title("Batch Processing")
        #remove icon
        #pba.wm_attributes('-toolwindow','True')
        pb = ProgressbarWithCancel(max=len(self.image_list))
        #pb.set_max(len(self.image_list))
        pb.set_progress(0)

        #if batch_output_dir doesn't exist, create it
        if not os.path.exists(batch_output_dir):
            os.makedirs(batch_output_dir)
        for i in range(len(self.image_list)):
            random_chance = random.randint(0, 25)
            if random_chance == 0:
                pb.set_random_label()
            if pb.is_cancelled():
                pb.destroy()
                return
            self.image_index = i
            #get float value of progress between 0 and 1 according to the image index and the total number of images
            progress = i / len(self.image_list)
            pb.set_progress(progress)
            self.update()
            try:
                img = Image.open(self.image_list[i]).convert("RGB")
            except:
                self.bad_files.append(self.image_list[i])
                #skip file
                continue
            tensor = transforms.Compose([
                transforms.Resize((self.blipSize, self.blipSize), interpolation=InterpolationMode.BICUBIC),
                transforms.ToTensor(),
                transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
            ])
            torch_image = tensor(img).unsqueeze(0).to(torch.device("cuda"))
            if self.nucleus_sampling:
                captions = self.blip_decoder.generate(torch_image, sample=True, top_p=self.q_factor)
            else:
                captions = self.blip_decoder.generate(torch_image, sample=False, num_beams=16, min_length=self.min_length,
                                                      max_length=48, repetition_penalty=self.q_factor)
            caption = captions[0]
            self.replace = self.replace_entry.get()
            self.replace_with = self.with_entry.get()
            self.suffix_var = self.suffix_entry.get()
            self.prefix = self.prefix_entry.get()
            #prepare the caption
            if self.suffix_var.startswith(',') or self.suffix_var.startswith(' '):
                self.suffix_var = self.suffix_var
            else:
                self.suffix_var = ' ' + self.suffix_var
            caption = caption.replace(self.replace, self.replace_with)
            if self.prefix != '':
                if self.prefix.endswith(' '):
                    self.prefix = self.prefix[:-1]
                if not self.prefix.endswith(','):
                    self.prefix = self.prefix + ','
                caption = self.prefix + ' ' + caption
            if caption.endswith(',') or caption.endswith('.'):
                caption = caption[:-1]
                caption = caption + ', ' + self.suffix_var
            else:
                caption = caption + self.suffix_var
            #saving the captioned image
            if self.output_format == 'text':
                #text file with same name as image
                imgName = os.path.basename(self.image_list[self.image_index])
                imgName = imgName[:imgName.rfind('.')]
                caption_file = os.path.join(batch_output_dir, imgName + '.txt')
                with open(caption_file, 'w') as f:
                    f.write(caption)
            elif self.output_format == 'filename':
                #duplicate image with caption as file name
                img.save(os.path.join(batch_output_dir, caption + '.png'))
            progress = (i + 1) / len(self.image_list)
            pb.set_progress(progress)
        #show message box when done
        pb.destroy()
        donemsg = tk.messagebox.showinfo("Batch Folder", "Batching complete!", parent=self.master)
        if len(self.bad_files) > 0:
            bad_files_msg = tk.messagebox.showinfo("Bad Files", "Couldn't process " + str(len(self.bad_files)) + " files.\nFor a list of problematic files see bad_files.txt", parent=self.master)
            with open('bad_files.txt', 'w') as f:
                for item in self.bad_files:
                    f.write(item + '\n')

        #ask user if we should load the batch output folder
        ask3 = tk.messagebox.askquestion("Batch Folder", "Load batch output folder?")
        if ask3 == 'yes':
            self.image_index = 0
            self.open_folder(folder=batch_output_dir)
        #focus on donemsg
        #donemsg.focus_force()

    def generate_caption(self):
        #get the image
        tensor = transforms.Compose([
            #transforms.CenterCrop(SIZE),
            transforms.Resize((self.blipSize, self.blipSize), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])
        torch_image = tensor(self.PILimage).unsqueeze(0).to(torch.device("cuda"))
        if self.nucleus_sampling:
            captions = self.blip_decoder.generate(torch_image, sample=True, top_p=self.q_factor)
        else:
            captions = self.blip_decoder.generate(torch_image, sample=False, num_beams=16, min_length=self.min_length,
                                                  max_length=48, repetition_penalty=self.q_factor)
        self.caption = captions[0]
        self.caption_entry.delete(0, tk.END)
        self.caption_entry.insert(0, self.caption)
        #change the caption entry color to red
        self.caption_entry.configure(fg_color='red')

    def load_blip_model(self):
        self.blipSize = 384
        blip_model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
        #check if options file exists
        if os.path.exists(os.path.join(self.captioner_folder, 'options.json')):
            with open(os.path.join(self.captioner_folder, 'options.json'), 'r') as f:
                #parse the file once instead of calling json.load on the same handle per key
                options = json.load(f)
            self.nucleus_sampling = options['nucleus_sampling']
            self.q_factor = options['q_factor']
            self.min_length = options['min_length']
        else:
            self.nucleus_sampling = False
            self.q_factor = 1.0
            self.min_length = 22
        config_path = os.path.join(self.captioner_folder, "BLIP/configs/med_config.json")
        cache_folder = os.path.join(self.captioner_folder, "BLIP/cache")
        model_path = os.path.join(self.captioner_folder, "BLIP/models/model_base_caption_capfilt_large.pth")
        if not os.path.exists(cache_folder):
            os.makedirs(cache_folder)

        if not os.path.exists(model_path):
            print(f"Downloading BLIP to {cache_folder}")
            with requests.get(blip_model_url, stream=True) as session:
                session.raise_for_status()
                with open(model_path, 'wb') as f:
                    for chunk in session.iter_content(chunk_size=1024):
                        f.write(chunk)
            print('Download complete')
        else:
            print("Found BLIP model")
        import models.blip
        blip_decoder = models.blip.blip_decoder(pretrained=model_path, image_size=self.blipSize, vit='base', med_config=config_path)
        blip_decoder.eval()
        self.blip_decoder = blip_decoder.to(torch.device("cuda"))

    def batch_folder_mask(self):
        folder = ''
        try:
            # check if self.folder is set
            folder = self.folder
        except:
            pass

        dialog = BatchMaskWindow(self, folder)
        dialog.mainloop()

    def load_clip_seg_model(self):
        if self.clip_seg is None:
            self.clip_seg = ClipSeg()

    def open_folder(self, folder=None):
        if folder is None:
            self.folder = fd.askdirectory()
        else:
            self.folder = folder
        if self.folder == '':
            return
        self.output_folder = self.folder
        self.image_list = [os.path.join(self.folder, f) for f in os.listdir(self.folder) if _check_file_type(f) and not f.endswith('-masklabel.png') and not f.endswith('-depth.png')]
        #self.image_list.sort()
        #sort the image list alphabetically so that the images are in the same order every time
        self.image_list.sort(key=lambda x: x.lower())

        self.image_count = len(self.image_list)
        if self.image_count == 0:
            tk.messagebox.showinfo("No Images", "No images found in the selected folder")
            return
        #update the image count label

        self.image_index = 0
        self.image_count_label.configure(text=f'Image {self.image_index+1} of {self.image_count}')
        self.output_folder = self.folder
        self.load_image()
        self.caption_entry.focus_set()

    def draw_mask(self, event):
        if not self.enable_mask_editing.get():
            return

        if event.widget != self.image_label.children["!label"]:
            return

        start_x = int(event.x / self.image_size[0] * self.PILimage.width)
        start_y = int(event.y / self.image_size[1] * self.PILimage.height)
        end_x = int(self.mask_draw_x / self.image_size[0] * self.PILimage.width)
        end_y = int(self.mask_draw_y / self.image_size[1] * self.PILimage.height)

        self.mask_draw_x = event.x
        self.mask_draw_y = event.y

        color = None

        if event.state & 0x0100 or event.num == 1:  # left mouse button
            color = (255, 255, 255)
        elif event.state & 0x0400 or event.num == 3:  # right mouse button
            color = (0, 0, 0)

        if color is not None:
            if self.PILmask is None:
                self.PILmask = Image.new('RGB', size=self.PILimage.size, color=(0, 0, 0))

            draw = ImageDraw.Draw(self.PILmask)
            draw.line((start_x, start_y, end_x, end_y), fill=color, width=self.mask_draw_radius + self.mask_draw_radius + 1)
            draw.ellipse((start_x - self.mask_draw_radius, start_y - self.mask_draw_radius, start_x + self.mask_draw_radius, start_y + self.mask_draw_radius), fill=color, outline=None)
            draw.ellipse((end_x - self.mask_draw_radius, end_y - self.mask_draw_radius, end_x + self.mask_draw_radius, end_y + self.mask_draw_radius), fill=color, outline=None)

            self.compose_masked_image()
            self.display_image()

    def draw_mask_radius(self, event):
        if event.widget != self.image_label.children["!label"]:
            return

        delta = -np.sign(event.delta) * 5
        self.mask_draw_radius += delta

    def compose_masked_image(self):
        np_image = np.array(self.PILimage).astype(np.float32) / 255.0
        np_mask = np.array(self.PILmask).astype(np.float32) / 255.0
        np_mask = np.clip(np_mask, 0.4, 1.0)
        np_masked_image = (np_image * np_mask * 255.0).astype(np.uint8)
        self.image = Image.fromarray(np_masked_image, mode='RGB')

    def display_image(self):
        #resize to fit 600x600 while maintaining aspect ratio
        width, height = self.image.size
        if width > height:
            new_width = 600
            new_height = int(600 * height / width)
        else:
            new_height = 600
            new_width = int(600 * width / height)
        self.image_size = (new_width, new_height)
        self.image = self.image.resize(self.image_size, Image.Resampling.LANCZOS)
        self.image = ctk.CTkImage(self.image, size=self.image_size)
        self.image_label.configure(image=self.image)

    def load_image(self):
        try:
            self.PILimage = Image.open(self.image_list[self.image_index]).convert('RGB')
        except:
            print(f'Error opening image {self.image_list[self.image_index]}')
            print('Logged path to bad_files.txt')
            #if bad_files.txt doesn't exist, create it
            if not os.path.exists('bad_files.txt'):
                with open('bad_files.txt', 'w') as f:
                    f.write(self.image_list[self.image_index] + '\n')
            else:
                with open('bad_files.txt', 'a') as f:
                    f.write(self.image_list[self.image_index] + '\n')
            return

        self.image = self.PILimage.copy()

        try:
            self.PILmask = None
            mask_filename = os.path.splitext(self.image_list[self.image_index])[0] + '-masklabel.png'
            if os.path.exists(mask_filename):
                self.PILmask = Image.open(mask_filename).convert('RGB')
                self.compose_masked_image()
        except Exception as e:
            print(f'Error opening mask for {self.image_list[self.image_index]}')
            print('Logged path to bad_files.txt')
            #if bad_files.txt doesn't exist, create it
            if not os.path.exists('bad_files.txt'):
                with open('bad_files.txt', 'w') as f:
                    f.write(self.image_list[self.image_index] + '\n')
            else:
                with open('bad_files.txt', 'a') as f:
                    f.write(self.image_list[self.image_index] + '\n')
            return

        self.display_image()

        self.caption_file_path = self.image_list[self.image_index]
        self.caption_file_name = os.path.basename(self.caption_file_path)
        self.caption_file_ext = os.path.splitext(self.caption_file_name)[1]
        self.caption_file_name_no_ext = os.path.splitext(self.caption_file_name)[0]
        self.caption_file = os.path.join(self.folder, self.caption_file_name_no_ext + '.txt')
        if os.path.isfile(self.caption_file) and self.auto_generate_caption.get() == False or os.path.isfile(self.caption_file) and self.auto_generate_caption.get() == True and self.auto_generate_caption_text_override.get() == True:
            with open(self.caption_file, 'r') as f:
                self.caption = f.read()
            self.caption_entry.delete(0, tk.END)
            self.caption_entry.insert(0, self.caption)
            self.caption_entry.configure(fg_color=ThemeManager.theme["CTkEntry"]["fg_color"])
            self.use_blip = False
        elif os.path.isfile(self.caption_file) and self.auto_generate_caption.get() == True and self.auto_generate_caption_text_override.get() == False or os.path.isfile(self.caption_file) == False and self.auto_generate_caption.get() == True and self.auto_generate_caption_text_override.get() == True:
            self.use_blip = True
            self.caption_entry.delete(0, tk.END)
        elif os.path.isfile(self.caption_file) == False and self.auto_generate_caption.get() == False:
            self.caption_entry.delete(0, tk.END)
            return
        if self.use_blip and self.debug == False:
            tensor = transforms.Compose([
                transforms.Resize((self.blipSize, self.blipSize), interpolation=InterpolationMode.BICUBIC),
                transforms.ToTensor(),
                transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
            ])
            torch_image = tensor(self.PILimage).unsqueeze(0).to(torch.device("cuda"))
            if self.nucleus_sampling:
                captions = self.blip_decoder.generate(torch_image, sample=True, top_p=self.q_factor)
            else:
                captions = self.blip_decoder.generate(torch_image, sample=False, num_beams=16, min_length=self.min_length,
                                                      max_length=48, repetition_penalty=self.q_factor)
            self.caption = captions[0]
            self.caption_entry.delete(0, tk.END)
            self.caption_entry.insert(0, self.caption)
            #change the caption entry color to red
            self.caption_entry.configure(fg_color='red')

    def save(self, event):
        self.save_caption()

        if self.enable_mask_editing.get():
            self.save_mask()

    def save_mask(self):
        mask_filename = os.path.splitext(self.image_list[self.image_index])[0] + '-masklabel.png'
        if self.PILmask is not None:
            self.PILmask.save(mask_filename)

    def save_caption(self):
        self.caption = self.caption_entry.get()
        self.replace = self.replace_entry.get()
        self.replace_with = self.with_entry.get()
        self.suffix_var = self.suffix_entry.get()
        self.prefix = self.prefix_entry.get()
        #prepare the caption
        self.caption = self.caption.replace(self.replace, self.replace_with)
        if self.suffix_var.startswith(',') or self.suffix_var.startswith(' '):
            self.suffix_var = self.suffix_var
        else:
            self.suffix_var = ' ' + self.suffix_var
        if self.prefix != '':
            if self.prefix.endswith(' '):
                self.prefix = self.prefix[:-1]
            if not self.prefix.endswith(','):
                self.prefix = self.prefix + ','
            self.caption = self.prefix + ' ' + self.caption
        if self.caption.endswith(',') or self.caption.endswith('.'):
            self.caption = self.caption[:-1]
            self.caption = self.caption + ', ' + self.suffix_var
        else:
            self.caption = self.caption + self.suffix_var
        self.caption = self.caption.strip()
        if self.output_folder != self.folder:
            outputFolder = self.output_folder
        else:
            outputFolder = self.folder
        if self.output_format == 'text':
            #text file with same name as image
            #image name
            #print('test')
            imgName = os.path.basename(self.image_list[self.image_index])
            imgName = imgName[:imgName.rfind('.')]
            self.caption_file = os.path.join(outputFolder, imgName + '.txt')
            with open(self.caption_file, 'w') as f:
                f.write(self.caption)
        elif self.output_format == 'filename':
            #duplicate image with caption as file name
            #make sure self.caption doesn't contain any illegal characters
            illegal_chars = ['/', '\\', ':', '*', '?', '"', "'", '<', '>', '|', '.']
            for char in illegal_chars:
                self.caption = self.caption.replace(char, '')
            self.PILimage.save(os.path.join(outputFolder, self.caption + '.png'))
        self.caption_entry.delete(0, tk.END)
        self.caption_entry.insert(0, self.caption)
        self.caption_entry.configure(fg_color='green')

        self.caption_entry.focus_force()

    def delete_word(self, event):
        ent = event.widget
        end_idx = ent.index(tk.INSERT)
        start_idx = ent.get().rfind(" ", None, end_idx)
        ent.selection_range(start_idx, end_idx)

    def prev_image(self, event):
        if self.image_index > 0:
            self.image_index -= 1
            self.image_count_label.configure(text=f'Image {self.image_index+1} of {self.image_count}')
            self.load_image()
            self.caption_entry.focus_set()
            self.caption_entry.focus_force()

    def next_image(self, event):
        if self.image_index < len(self.image_list) - 1:
            self.image_index += 1
            self.image_count_label.configure(text=f'Image {self.image_index+1} of {self.image_count}')
            self.load_image()
            self.caption_entry.focus_set()
            self.caption_entry.focus_force()

    def open_options(self):
        self.options_window = ctk.CTkToplevel(self)
        self.options_window.title("Options")
        self.options_window.geometry("320x550")
        #disable resize
        self.options_window.resizable(False, False)
        self.options_window.focus_force()
        self.options_window.grab_set()
        self.options_window.transient(self)
        self.options_window.protocol("WM_DELETE_WINDOW", self.close_options)
        #add title label
        self.options_title_label = ctk.CTkLabel(self.options_window, text="Options", font=ctk.CTkFont(size=20, weight="bold"))
        self.options_title_label.pack(side="top", pady=5)
        #add an entry with a button to select a folder as output folder
        self.output_folder_label = ctk.CTkLabel(self.options_window, text="Output Folder")
        self.output_folder_label.pack(side="top", pady=5)
        self.output_folder_entry = ctk.CTkEntry(self.options_window)
        self.output_folder_entry.pack(side="top", fill="x", expand=False, padx=15, pady=5)
        self.output_folder_entry.insert(0, self.output_folder)
        self.output_folder_button = ctk.CTkButton(self.options_window, text="Select Folder", command=self.select_output_folder, fg_color=("gray75", "gray25"))
        self.output_folder_button.pack(side="top", pady=5)
        #add radio buttons to select the output format between text and filename
        self.output_format_label = ctk.CTkLabel(self.options_window, text="Output Format")
        self.output_format_label.pack(side="top", pady=5)
        self.output_format_var = tk.StringVar(self.options_window)
        self.output_format_var.set(self.output_format)
        self.output_format_text = ctk.CTkRadioButton(self.options_window, text="Text File", variable=self.output_format_var, value="text")
        self.output_format_text.pack(side="top", pady=5)
        self.output_format_filename = ctk.CTkRadioButton(self.options_window, text="File name", variable=self.output_format_var, value="filename")
        self.output_format_filename.pack(side="top", pady=5)
        #add BLIP settings section
        self.blip_settings_label = ctk.CTkLabel(self.options_window, text="BLIP Settings", font=ctk.CTkFont(size=20, weight="bold"))
        self.blip_settings_label.pack(side="top", pady=10)
        #add a checkbox to use nucleus sampling or not
        self.nucleus_sampling_var = tk.IntVar(self.options_window)
        self.nucleus_sampling_checkbox = ctk.CTkCheckBox(self.options_window, text="Use nucleus sampling", variable=self.nucleus_sampling_var)
        self.nucleus_sampling_checkbox.pack(side="top", pady=5)
        if self.debug:
            self.nucleus_sampling = 0
            self.q_factor = 0.5
            self.min_length = 10
        self.nucleus_sampling_var.set(self.nucleus_sampling)
        #add a float entry to set the q factor
        self.q_factor_label = ctk.CTkLabel(self.options_window, text="Q Factor")
        self.q_factor_label.pack(side="top", pady=5)
        self.q_factor_entry = ctk.CTkEntry(self.options_window)
        self.q_factor_entry.insert(0, self.q_factor)
        self.q_factor_entry.pack(side="top", pady=5)
        #add an int entry to set the minimum length
        self.min_length_label = ctk.CTkLabel(self.options_window, text="Minimum Length")
        self.min_length_label.pack(side="top", pady=5)
        self.min_length_entry = ctk.CTkEntry(self.options_window)
        self.min_length_entry.insert(0, self.min_length)
        self.min_length_entry.pack(side="top", pady=5)
        #add a horizontal radio button to select between None, ViT-L-14/openai, ViT-H-14/laion2b_s32b_b79k
        #self.model_label = ctk.CTkLabel(self.options_window, text="CLIP Interrogation")
        #self.model_label.pack(side="top")
        #self.model_var = tk.StringVar(self.options_window)
        #self.model_var.set(self.model)
        #self.model_none = tk.Radiobutton(self.options_window, text="None", variable=self.model_var, value="None")
        #self.model_none.pack(side="top")
        #self.model_vit_l_14 = tk.Radiobutton(self.options_window, text="ViT-L-14/openai", variable=self.model_var, value="ViT-L-14/openai")
        #self.model_vit_l_14.pack(side="top")
        #self.model_vit_h_14 = tk.Radiobutton(self.options_window, text="ViT-H-14/laion2b_s32b_b79k", variable=self.model_var, value="ViT-H-14/laion2b_s32b_b79k")
        #self.model_vit_h_14.pack(side="top")

        #add a save button
        self.save_button = ctk.CTkButton(self.options_window, text="Save", command=self.save_options, fg_color=("gray75", "gray25"))
        self.save_button.pack(side="top", fill='x', pady=10, padx=10)
        #all entries list
        entries = [self.output_folder_entry, self.q_factor_entry, self.min_length_entry]
        #bind the right click to all entries
        for entry in entries:
            entry.bind("<Button-3>", self.create_right_click_menu)
        self.options_file = os.path.join(self.captioner_folder, 'captioner_options.json')
        if os.path.isfile(self.options_file):
            with open(self.options_file, 'r') as f:
                self.options = json.load(f)
            self.output_folder_entry.delete(0, tk.END)
            self.output_folder_entry.insert(0, self.output_folder)
            self.output_format_var.set(self.options['output_format'])
            self.nucleus_sampling_var.set(self.options['nucleus_sampling'])
            self.q_factor_entry.delete(0, tk.END)
            self.q_factor_entry.insert(0, self.options['q_factor'])
            self.min_length_entry.delete(0, tk.END)
            self.min_length_entry.insert(0, self.options['min_length'])

    def load_options(self):
        self.options_file = os.path.join(self.captioner_folder, 'captioner_options.json')
        if os.path.isfile(self.options_file):
            with open(self.options_file, 'r') as f:
                self.options = json.load(f)
            #self.output_folder = self.folder
            #self.output_folder = self.options['output_folder']
            if 'folder' in self.__dict__:
                self.output_folder = self.folder
            else:
                self.output_folder = ''
            self.output_format = self.options['output_format']
            self.nucleus_sampling = self.options['nucleus_sampling']
            self.q_factor = self.options['q_factor']
            self.min_length = self.options['min_length']
        else:
            #if self has folder, use it, otherwise use the current folder
            if 'folder' in self.__dict__:
                self.output_folder = self.folder
            else:
                self.output_folder = ''
            self.output_format = "text"
            self.nucleus_sampling = False
            self.q_factor = 0.9
            self.min_length = 22

    def save_options(self):
        self.output_folder = self.output_folder_entry.get()
        self.output_format = self.output_format_var.get()
        self.nucleus_sampling = self.nucleus_sampling_var.get()
        self.q_factor = float(self.q_factor_entry.get())
        self.min_length = int(self.min_length_entry.get())
        #save options to a file
        self.options_file = os.path.join(self.captioner_folder, 'captioner_options.json')
        with open(self.options_file, 'w') as f:
            json.dump({'output_folder': self.output_folder, 'output_format': self.output_format, 'nucleus_sampling': self.nucleus_sampling, 'q_factor': self.q_factor, 'min_length': self.min_length}, f)
        self.close_options()

    def select_output_folder(self):
        self.output_folder = fd.askdirectory()
        self.output_folder_entry.delete(0, tk.END)
        self.output_folder_entry.insert(0, self.output_folder)

    def close_options(self):
        self.options_window.destroy()
        self.caption_entry.focus_force()

    def create_right_click_menu(self, event):
        #create a menu
        self.menu = Menu(self, tearoff=0)
        #add commands to the menu
        self.menu.add_command(label="Cut", command=lambda: self.focus_get().event_generate("<<Cut>>"))
        self.menu.add_command(label="Copy", command=lambda: self.focus_get().event_generate("<<Copy>>"))
        self.menu.add_command(label="Paste", command=lambda: self.focus_get().event_generate("<<Paste>>"))
        self.menu.add_command(label="Select All", command=lambda: self.focus_get().event_generate("<<SelectAll>>"))
        #display the menu
        try:
            self.menu.tk_popup(event.x_root, event.y_root)
        finally:
            #make sure to release the grab (Tk 8.0a1 only)
            self.menu.grab_release()


#progress bar class with cancel button
class ProgressbarWithCancel(ctk.CTkToplevel):
    def __init__(self, max=None, **kw):
        super().__init__(**kw)
        self.title("Batching...")
        self.max = max
        self.possibleLabels = ['Searching for answers...', "I'm working, I promise.", 'ARE THOSE TENTACLES?!', 'Weird data man...', 'Another one bites the dust', "I think it's a cat?", 'Looking for the meaning of life', 'Dreaming of captions']

        self.label = ctk.CTkLabel(self, text="Searching for answers...")
        self.label.pack(side="top", fill="x", expand=True, padx=10, pady=10)
        self.progress = ctk.CTkProgressBar(self, orientation="horizontal", mode="determinate")
        self.progress.pack(side="left", fill="x", expand=True, padx=10, pady=10)
        self.cancel_button = ctk.CTkButton(self, text="Cancel", command=self.cancel)
        self.cancel_button.pack(side="right", padx=10, pady=10)
        self.cancelled = False
        self.count_label = ctk.CTkLabel(self, text="0/{0}".format(self.max))
        self.count_label.pack(side="right", padx=10, pady=10)

    def set_random_label(self):
        self.label.configure(text=random.choice(self.possibleLabels))
        #pop from list
        #self.possibleLabels.remove(self.label.cget("text"))

    def cancel(self):
        self.cancelled = True

    def set_progress(self, value):
        self.progress.set(value)
        self.count_label.configure(text="{0}/{1}".format(int(value * self.max), self.max))

    def get_progress(self):
        return self.progress.get()

    def set_max(self, value):
        self.max = value

    def get_max(self):
        return self.max

    def is_cancelled(self):
        return self.cancelled
    #quit the progress bar window


#run when executed directly
if __name__ == "__main__":

    #root = tk.Tk()
    app = ImageBrowser()
    app.mainloop()
StableTuner_RunPod_Fix/clip_segmentation.py
ADDED
@@ -0,0 +1,325 @@
import argparse
import os
from typing import Optional, Callable

import torch
from PIL import Image
from torch import Tensor, nn
from torchvision.transforms import transforms, functional
from tqdm.auto import tqdm
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation

DEVICE = "cuda"


def parse_args():
    parser = argparse.ArgumentParser(description="ClipSeg script.")
    parser.add_argument(
        "--sample_dir",
        type=str,
        required=True,
        help="directory where samples are located",
    )
    parser.add_argument(
        "--add_prompt",
        type=str,
        required=True,
        action="append",
        help="a prompt used to create a mask",
        dest="prompts",
    )
    parser.add_argument(
        "--mode",
        type=str,
        default='fill',
        required=False,
        help="Either replace, fill, add or subtract",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.3,
        required=False,
        help="threshold for including pixels in the mask",
    )
    parser.add_argument(
        "--smooth_pixels",
        type=int,
        default=5,
        required=False,
        help="radius of a smoothing operation applied to the generated mask",
    )
    parser.add_argument(
        "--expand_pixels",
        type=int,
        default=10,
        required=False,
        help="amount of expansion of the generated mask in all directions",
    )

    args = parser.parse_args()
    return args


class MaskSample:
    def __init__(self, filename: str):
        self.image_filename = filename
        self.mask_filename = os.path.splitext(filename)[0] + "-masklabel.png"

        self.image = None
        self.mask_tensor = None

        self.height = 0
        self.width = 0

        self.image2Tensor = transforms.Compose([
            transforms.ToTensor(),
        ])

        self.tensor2Image = transforms.Compose([
            transforms.ToPILImage(),
        ])

    def get_image(self) -> Image:
        if self.image is None:
            self.image = Image.open(self.image_filename).convert('RGB')
            self.height = self.image.height
            self.width = self.image.width

        return self.image

    def get_mask_tensor(self) -> Tensor:
        if self.mask_tensor is None and os.path.exists(self.mask_filename):
            mask = Image.open(self.mask_filename).convert('L')
            mask = self.image2Tensor(mask)
            mask = mask.to(DEVICE)
            self.mask_tensor = mask.unsqueeze(0)

        return self.mask_tensor

    def set_mask_tensor(self, mask_tensor: Tensor):
        self.mask_tensor = mask_tensor

    def add_mask_tensor(self, mask_tensor: Tensor):
        mask = self.get_mask_tensor()
        if mask is None:
            mask = mask_tensor
        else:
            mask += mask_tensor
        mask = torch.clamp(mask, 0, 1)

        self.mask_tensor = mask

    def subtract_mask_tensor(self, mask_tensor: Tensor):
        mask = self.get_mask_tensor()
        if mask is None:
            mask = mask_tensor
        else:
            mask -= mask_tensor
        mask = torch.clamp(mask, 0, 1)

        self.mask_tensor = mask

    def save_mask(self):
        if self.mask_tensor is not None:
            mask = self.mask_tensor.cpu().squeeze()
            mask = self.tensor2Image(mask).convert('RGB')
            mask.save(self.mask_filename)


class ClipSeg:
    def __init__(self):
        self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")

        self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")
        self.model.eval()
        self.model.to(DEVICE)

        self.smoothing_kernel_radius = None
        self.smoothing_kernel = self.__create_average_kernel(self.smoothing_kernel_radius)

        self.expand_kernel_radius = None
        self.expand_kernel = self.__create_average_kernel(self.expand_kernel_radius)

    @staticmethod
    def __create_average_kernel(kernel_radius: Optional[int]):
        if kernel_radius is None:
            return None

        kernel_size = kernel_radius * 2 + 1
        kernel_weights = torch.ones(1, 1, kernel_size, kernel_size) / (kernel_size * kernel_size)
        kernel = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=kernel_size, bias=False, padding_mode='replicate', padding=kernel_radius)
        kernel.weight.data = kernel_weights
        kernel.requires_grad_(False)
        kernel.to(DEVICE)
        return kernel

    @staticmethod
    def __get_sample_filenames(sample_dir: str) -> [str]:
        filenames = []
        for filename in os.listdir(sample_dir):
for filename in os.listdir(sample_dir):
|
161 |
+
ext = os.path.splitext(filename)[1].lower()
|
162 |
+
if ext in ['.jpg', '.jpeg', '.png', '.bmp', '.webp'] and '-masklabel.png' not in filename:
|
163 |
+
filenames.append(os.path.join(sample_dir, filename))
|
164 |
+
|
165 |
+
return filenames
|
166 |
+
|
167 |
+
def __process_mask(self, mask: Tensor, target_height: int, target_width: int, threshold: float) -> Tensor:
|
168 |
+
while len(mask.shape) < 4:
|
169 |
+
mask = mask.unsqueeze(0)
|
170 |
+
|
171 |
+
mask = torch.sigmoid(mask)
|
172 |
+
mask = mask.sum(1).unsqueeze(1)
|
173 |
+
if self.smoothing_kernel is not None:
|
174 |
+
mask = self.smoothing_kernel(mask)
|
175 |
+
mask = functional.resize(mask, [target_height, target_width])
|
176 |
+
mask = (mask > threshold).float()
|
177 |
+
if self.expand_kernel is not None:
|
178 |
+
mask = self.expand_kernel(mask)
|
179 |
+
mask = (mask > 0).float()
|
180 |
+
|
181 |
+
return mask
|
182 |
+
|
183 |
+
def mask_image(self, filename: str, prompts: [str], mode: str = 'fill', threshold: float = 0.3, smooth_pixels: int = 5, expand_pixels: int = 10):
|
184 |
+
"""
|
185 |
+
Masks a sample
|
186 |
+
|
187 |
+
Parameters:
|
188 |
+
filename (`str`): a sample filename
|
189 |
+
prompts (`[str]`): a list of prompts used to create a mask
|
190 |
+
mode (`str`): can be one of
|
191 |
+
- replace: creates new masks for all samples, even if a mask already exists
|
192 |
+
- fill: creates new masks for all samples without a mask
|
193 |
+
- add: adds the new region to existing masks
|
194 |
+
- subtract: subtracts the new region from existing masks
|
195 |
+
threshold (`float`): threshold for including pixels in the mask
|
196 |
+
smooth_pixels (`int`): radius of a smoothing operation applied to the generated mask
|
197 |
+
expand_pixels (`int`): amount of expansion of the generated mask in all directions
|
198 |
+
"""
|
199 |
+
|
200 |
+
mask_sample = MaskSample(filename)
|
201 |
+
|
202 |
+
if mode == 'fill' and mask_sample.get_mask_tensor() is not None:
|
203 |
+
return
|
204 |
+
|
205 |
+
if self.smoothing_kernel_radius != smooth_pixels:
|
206 |
+
self.smoothing_kernel = self.__create_average_kernel(smooth_pixels)
|
207 |
+
self.smoothing_kernel_radius = smooth_pixels
|
208 |
+
|
209 |
+
if self.expand_kernel_radius != expand_pixels:
|
210 |
+
self.expand_kernel = self.__create_average_kernel(expand_pixels)
|
211 |
+
self.expand_kernel_radius = expand_pixels
|
212 |
+
|
213 |
+
inputs = self.processor(text=prompts, images=[mask_sample.get_image()] * len(prompts), padding="max_length", return_tensors="pt")
|
214 |
+
inputs.to(DEVICE)
|
215 |
+
with torch.no_grad():
|
216 |
+
outputs = self.model(**inputs)
|
217 |
+
predicted_mask = self.__process_mask(outputs.logits, mask_sample.height, mask_sample.width, threshold)
|
218 |
+
|
219 |
+
if mode == 'replace' or mode == 'fill':
|
220 |
+
mask_sample.set_mask_tensor(predicted_mask)
|
221 |
+
elif mode == 'add':
|
222 |
+
mask_sample.add_mask_tensor(predicted_mask)
|
223 |
+
elif mode == 'subtract':
|
224 |
+
mask_sample.subtract_mask_tensor(predicted_mask)
|
225 |
+
|
226 |
+
mask_sample.save_mask()
|
227 |
+
|
228 |
+
def mask_folder(
|
229 |
+
self,
|
230 |
+
sample_dir: str,
|
231 |
+
prompts: [str],
|
232 |
+
mode: str = 'fill',
|
233 |
+
threshold: float = 0.3,
|
234 |
+
smooth_pixels: int = 5,
|
235 |
+
expand_pixels: int = 10,
|
236 |
+
progress_callback: Callable[[int, int], None] = None,
|
237 |
+
error_callback: Callable[[str], None] = None,
|
238 |
+
):
|
239 |
+
"""
|
240 |
+
Masks all samples in a folder
|
241 |
+
|
242 |
+
Parameters:
|
243 |
+
sample_dir (`str`): directory where samples are located
|
244 |
+
prompts (`[str]`): a list of prompts used to create a mask
|
245 |
+
mode (`str`): can be one of
|
246 |
+
- replace: creates new masks for all samples, even if a mask already exists
|
247 |
+
- fill: creates new masks for all samples without a mask
|
248 |
+
- add: adds the new region to existing masks
|
249 |
+
- subtract: subtracts the new region from existing masks
|
250 |
+
threshold (`float`): threshold for including pixels in the mask
|
251 |
+
smooth_pixels (`int`): radius of a smoothing operation applied to the generated mask
|
252 |
+
expand_pixels (`int`): amount of expansion of the generated mask in all directions
|
253 |
+
progress_callback (`Callable[[int, int], None]`): called after every processed image
|
254 |
+
error_callback (`Callable[[str], None]`): called for every exception
|
255 |
+
"""
|
256 |
+
|
257 |
+
filenames = self.__get_sample_filenames(sample_dir)
|
258 |
+
self.mask_images(
|
259 |
+
filenames=filenames,
|
260 |
+
prompts=prompts,
|
261 |
+
mode=mode,
|
262 |
+
threshold=threshold,
|
263 |
+
smooth_pixels=smooth_pixels,
|
264 |
+
expand_pixels=expand_pixels,
|
265 |
+
progress_callback=progress_callback,
|
266 |
+
error_callback=error_callback,
|
267 |
+
)
|
268 |
+
|
269 |
+
def mask_images(
|
270 |
+
self,
|
271 |
+
filenames: [str],
|
272 |
+
prompts: [str],
|
273 |
+
mode: str = 'fill',
|
274 |
+
threshold: float = 0.3,
|
275 |
+
smooth_pixels: int = 5,
|
276 |
+
expand_pixels: int = 10,
|
277 |
+
progress_callback: Callable[[int, int], None] = None,
|
278 |
+
error_callback: Callable[[str], None] = None,
|
279 |
+
):
|
280 |
+
"""
|
281 |
+
Masks all samples in a list
|
282 |
+
|
283 |
+
Parameters:
|
284 |
+
filenames (`[str]`): a list of sample filenames
|
285 |
+
prompts (`[str]`): a list of prompts used to create a mask
|
286 |
+
mode (`str`): can be one of
|
287 |
+
- replace: creates new masks for all samples, even if a mask already exists
|
288 |
+
- fill: creates new masks for all samples without a mask
|
289 |
+
- add: adds the new region to existing masks
|
290 |
+
- subtract: subtracts the new region from existing masks
|
291 |
+
threshold (`float`): threshold for including pixels in the mask
|
292 |
+
smooth_pixels (`int`): radius of a smoothing operation applied to the generated mask
|
293 |
+
expand_pixels (`int`): amount of expansion of the generated mask in all directions
|
294 |
+
progress_callback (`Callable[[int, int], None]`): called after every processed image
|
295 |
+
error_callback (`Callable[[str], None]`): called for every exception
|
296 |
+
"""
|
297 |
+
|
298 |
+
if progress_callback is not None:
|
299 |
+
progress_callback(0, len(filenames))
|
300 |
+
for i, filename in enumerate(tqdm(filenames)):
|
301 |
+
try:
|
302 |
+
self.mask_image(filename, prompts, mode, threshold, smooth_pixels, expand_pixels)
|
303 |
+
except Exception as e:
|
304 |
+
if error_callback is not None:
|
305 |
+
error_callback(filename)
|
306 |
+
if progress_callback is not None:
|
307 |
+
progress_callback(i + 1, len(filenames))
|
308 |
+
|
309 |
+
|
310 |
+
def main():
|
311 |
+
args = parse_args()
|
312 |
+
clip_seg = ClipSeg()
|
313 |
+
clip_seg.mask_folder(
|
314 |
+
sample_dir=args.sample_dir,
|
315 |
+
prompts=args.prompts,
|
316 |
+
mode=args.mode,
|
317 |
+
threshold=args.threshold,
|
318 |
+
smooth_pixels=args.smooth_pixels,
|
319 |
+
expand_pixels=args.expand_pixels,
|
320 |
+
error_callback=lambda filename: print("Error while processing image " + filename)
|
321 |
+
)
|
322 |
+
|
323 |
+
|
324 |
+
if __name__ == "__main__":
|
325 |
+
main()
|
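A short usage sketch for the script above, both from the command line (arguments come from parse_args; --add_prompt can be repeated) and through the ClipSeg class directly. The directory and prompt values are placeholders:

# Command line:
#   python clip_segmentation.py --sample_dir /path/to/samples --add_prompt "a person" --mode fill --threshold 0.3

# The same through the Python API; each mask is written next to its image
# as <name>-masklabel.png by MaskSample.save_mask().
from clip_segmentation import ClipSeg

clip_seg = ClipSeg()
clip_seg.mask_folder(
    sample_dir="/path/to/samples",   # placeholder directory
    prompts=["a person"],            # one entry per --add_prompt
    mode="fill",                     # replace, fill, add or subtract
    threshold=0.3,
    smooth_pixels=5,
    expand_pixels=10,
)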
StableTuner_RunPod_Fix/configuration_gui.py
ADDED
The diff for this file is too large to render.
See raw diff
StableTuner_RunPod_Fix/convert_diffusers_to_sd_cli.py
ADDED
@@ -0,0 +1,22 @@
import sys
import os
try:
    import converters
except ImportError:

    #if there's a scripts folder where the script is, add it to the path
    if 'scripts' in os.listdir(os.path.dirname(os.path.abspath(__file__))):
        sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '\\scripts')
    else:
        print('Could not find scripts folder. Please add it to the path manually or place this file in it.')
    import converters


if __name__ == '__main__':
    args = sys.argv[1:]
    if len(args) != 2:
        print('Usage: python3 convert_diffusers_to_sd.py <model_path> <output_path>')
        sys.exit(1)
    model_path = args[0]
    output_path = args[1]
    converters.Convert_Diffusers_to_SD(model_path, output_path)
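As the usage message above indicates, the wrapper expects exactly two positional arguments. A minimal sketch of both ways to run the conversion, with placeholder paths:

# From a shell:
#   python convert_diffusers_to_sd_cli.py /path/to/diffusers_model /path/to/output/model.ckpt

# Or directly from Python, which is all the wrapper does internally:
import converters

converters.Convert_Diffusers_to_SD("/path/to/diffusers_model", "/path/to/output/model.ckpt")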
StableTuner_RunPod_Fix/converters.py
ADDED
@@ -0,0 +1,120 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import requests
import os
import os.path as osp
import torch
try:
    from omegaconf import OmegaConf
except ImportError:
    raise ImportError(
        "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`."
    )

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    HeunDiscreteScheduler,
    LDMTextToImagePipeline,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPipeline,
    UNet2DConditionModel,
    DiffusionPipeline
)
from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
#from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer, CLIPVisionConfig, CLIPTextConfig
import model_util

class Convert_SD_to_Diffusers():

    def __init__(self, checkpoint_path, output_path, prediction_type=None, img_size=None, original_config_file=None, extract_ema=False, num_in_channels=None, pipeline_type=None, scheduler_type=None, sd_version=None, half=None, version=None):
        self.checkpoint_path = checkpoint_path
        self.output_path = output_path
        self.prediction_type = prediction_type
        self.img_size = img_size
        self.original_config_file = original_config_file
        self.extract_ema = extract_ema
        self.num_in_channels = num_in_channels
        self.pipeline_type = pipeline_type
        self.scheduler_type = scheduler_type
        self.sd_version = sd_version
        self.half = half
        self.version = version
        self.main()

    def main(self):
        image_size = self.img_size
        prediction_type = self.prediction_type
        original_config_file = self.original_config_file
        num_in_channels = self.num_in_channels
        scheduler_type = self.scheduler_type
        pipeline_type = self.pipeline_type
        extract_ema = self.extract_ema
        reference_diffusers_model = None
        if self.version == 'v1':
            is_v1 = True
            is_v2 = False
        if self.version == 'v2':
            is_v1 = False
            is_v2 = True
        if is_v2 == True and prediction_type == 'vprediction':
            reference_diffusers_model = 'stabilityai/stable-diffusion-2'
        if is_v2 == True and prediction_type == 'epsilon':
            reference_diffusers_model = 'stabilityai/stable-diffusion-2-base'
        if is_v1 == True and prediction_type == 'epsilon':
            reference_diffusers_model = 'runwayml/stable-diffusion-v1-5'
        dtype = 'fp16' if self.half else None
        v2_model = True if is_v2 else False
        print(f"loading model from: {self.checkpoint_path}")
        #print(v2_model)
        text_encoder, vae, unet = model_util.load_models_from_stable_diffusion_checkpoint(v2_model, self.checkpoint_path)
        print(f"copy scheduler/tokenizer config from: {reference_diffusers_model}")
        model_util.save_diffusers_checkpoint(v2_model, self.output_path, text_encoder, unet, reference_diffusers_model, vae)
        print(f"Diffusers model saved.")


class Convert_Diffusers_to_SD():
    def __init__(self, model_path=None, output_path=None):
        pass
        def main(model_path: str, output_path: str):
            #print(model_path)
            #print(output_path)
            global_step = None
            epoch = None
            dtype = torch.float32
            pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=dtype, tokenizer=None, safety_checker=None)
            text_encoder = pipe.text_encoder
            vae = pipe.vae
            if os.path.exists(os.path.join(model_path, "ema_unet")):
                pipe.unet = UNet2DConditionModel.from_pretrained(
                    model_path,
                    subfolder="ema_unet",
                    torch_dtype=dtype
                )
            unet = pipe.unet
            v2_model = unet.config.cross_attention_dim == 1024
            original_model = None
            key_count = model_util.save_stable_diffusion_checkpoint(v2_model, output_path, text_encoder, unet,
                                                                    original_model, epoch, global_step, dtype, vae)
            print(f"Saved model")
        return main(model_path, output_path)
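For the opposite direction, Convert_SD_to_Diffusers runs the conversion from its constructor (it calls self.main() at the end of __init__). A minimal sketch with placeholder paths; prediction_type and version select which reference repo the scheduler/tokenizer configs are copied from:

from converters import Convert_SD_to_Diffusers

Convert_SD_to_Diffusers(
    checkpoint_path="/path/to/model.ckpt",   # placeholder
    output_path="/path/to/diffusers_out",    # placeholder
    prediction_type="epsilon",               # 'epsilon' (v1 / v2-base) or 'vprediction' (v2-768)
    version="v1",                            # 'v1' or 'v2'
    half=False,                              # mapped to a 'fp16' dtype flag by the converter
)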
StableTuner_RunPod_Fix/dataloaders_util.py
ADDED
@@ -0,0 +1,1331 @@
import random
import math
import os
import torch
import torch.utils.checkpoint
from torch.utils.data import Dataset
from torchvision import transforms
from tqdm.auto import tqdm
import numpy as np
from PIL import Image
from trainer_util import *
from clip_segmentation import ClipSeg

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

ASPECT_2048 = [[2048, 2048], [2112, 1984], [1984, 2112], [2176, 1920], [1920, 2176], [2240, 1856], [1856, 2240], [2304, 1792], [1792, 2304], [2368, 1728], [1728, 2368], [2432, 1664], [1664, 2432], [2496, 1600], [1600, 2496], [2560, 1536], [1536, 2560], [2624, 1472], [1472, 2624]]
ASPECT_1984 = [[1984, 1984], [2048, 1920], [1920, 2048], [2112, 1856], [1856, 2112], [2176, 1792], [1792, 2176], [2240, 1728], [1728, 2240], [2304, 1664], [1664, 2304], [2368, 1600], [1600, 2368], [2432, 1536], [1536, 2432], [2496, 1472], [1472, 2496], [2560, 1408], [1408, 2560]]
ASPECT_1920 = [[1920, 1920], [1984, 1856], [1856, 1984], [2048, 1792], [1792, 2048], [2112, 1728], [1728, 2112], [2176, 1664], [1664, 2176], [2240, 1600], [1600, 2240], [2304, 1536], [1536, 2304], [2368, 1472], [1472, 2368], [2432, 1408], [1408, 2432], [2496, 1344], [1344, 2496]]
ASPECT_1856 = [[1856, 1856], [1920, 1792], [1792, 1920], [1984, 1728], [1728, 1984], [2048, 1664], [1664, 2048], [2112, 1600], [1600, 2112], [2176, 1536], [1536, 2176], [2240, 1472], [1472, 2240], [2304, 1408], [1408, 2304], [2368, 1344], [1344, 2368], [2432, 1280], [1280, 2432]]
ASPECT_1792 = [[1792, 1792], [1856, 1728], [1728, 1856], [1920, 1664], [1664, 1920], [1984, 1600], [1600, 1984], [2048, 1536], [1536, 2048], [2112, 1472], [1472, 2112], [2176, 1408], [1408, 2176], [2240, 1344], [1344, 2240], [2304, 1280], [1280, 2304], [2368, 1216], [1216, 2368]]
ASPECT_1728 = [[1728, 1728], [1792, 1664], [1664, 1792], [1856, 1600], [1600, 1856], [1920, 1536], [1536, 1920], [1984, 1472], [1472, 1984], [2048, 1408], [1408, 2048], [2112, 1344], [1344, 2112], [2176, 1280], [1280, 2176], [2240, 1216], [1216, 2240], [2304, 1152], [1152, 2304]]
ASPECT_1664 = [[1664, 1664], [1728, 1600], [1600, 1728], [1792, 1536], [1536, 1792], [1856, 1472], [1472, 1856], [1920, 1408], [1408, 1920], [1984, 1344], [1344, 1984], [2048, 1280], [1280, 2048], [2112, 1216], [1216, 2112], [2176, 1152], [1152, 2176], [2240, 1088], [1088, 2240]]
ASPECT_1600 = [[1600, 1600], [1664, 1536], [1536, 1664], [1728, 1472], [1472, 1728], [1792, 1408], [1408, 1792], [1856, 1344], [1344, 1856], [1920, 1280], [1280, 1920], [1984, 1216], [1216, 1984], [2048, 1152], [1152, 2048], [2112, 1088], [1088, 2112], [2176, 1024], [1024, 2176]]
ASPECT_1536 = [[1536, 1536], [1600, 1472], [1472, 1600], [1664, 1408], [1408, 1664], [1728, 1344], [1344, 1728], [1792, 1280], [1280, 1792], [1856, 1216], [1216, 1856], [1920, 1152], [1152, 1920], [1984, 1088], [1088, 1984], [2048, 1024], [1024, 2048], [2112, 960], [960, 2112]]
ASPECT_1472 = [[1472, 1472], [1536, 1408], [1408, 1536], [1600, 1344], [1344, 1600], [1664, 1280], [1280, 1664], [1728, 1216], [1216, 1728], [1792, 1152], [1152, 1792], [1856, 1088], [1088, 1856], [1920, 1024], [1024, 1920], [1984, 960], [960, 1984], [2048, 896], [896, 2048]]
ASPECT_1408 = [[1408, 1408], [1472, 1344], [1344, 1472], [1536, 1280], [1280, 1536], [1600, 1216], [1216, 1600], [1664, 1152], [1152, 1664], [1728, 1088], [1088, 1728], [1792, 1024], [1024, 1792], [1856, 960], [960, 1856], [1920, 896], [896, 1920], [1984, 832], [832, 1984]]
ASPECT_1344 = [[1344, 1344], [1408, 1280], [1280, 1408], [1472, 1216], [1216, 1472], [1536, 1152], [1152, 1536], [1600, 1088], [1088, 1600], [1664, 1024], [1024, 1664], [1728, 960], [960, 1728], [1792, 896], [896, 1792], [1856, 832], [832, 1856], [1920, 768], [768, 1920]]
ASPECT_1280 = [[1280, 1280], [1344, 1216], [1216, 1344], [1408, 1152], [1152, 1408], [1472, 1088], [1088, 1472], [1536, 1024], [1024, 1536], [1600, 960], [960, 1600], [1664, 896], [896, 1664], [1728, 832], [832, 1728], [1792, 768], [768, 1792], [1856, 704], [704, 1856]]
ASPECT_1216 = [[1216, 1216], [1280, 1152], [1152, 1280], [1344, 1088], [1088, 1344], [1408, 1024], [1024, 1408], [1472, 960], [960, 1472], [1536, 896], [896, 1536], [1600, 832], [832, 1600], [1664, 768], [768, 1664], [1728, 704], [704, 1728], [1792, 640], [640, 1792]]
ASPECT_1152 = [[1152, 1152], [1216, 1088], [1088, 1216], [1280, 1024], [1024, 1280], [1344, 960], [960, 1344], [1408, 896], [896, 1408], [1472, 832], [832, 1472], [1536, 768], [768, 1536], [1600, 704], [704, 1600], [1664, 640], [640, 1664], [1728, 576], [576, 1728]]
ASPECT_1088 = [[1088, 1088], [1152, 1024], [1024, 1152], [1216, 960], [960, 1216], [1280, 896], [896, 1280], [1344, 832], [832, 1344], [1408, 768], [768, 1408], [1472, 704], [704, 1472], [1536, 640], [640, 1536], [1600, 576], [576, 1600], [1664, 512], [512, 1664]]
ASPECT_832 = [[832, 832], [896, 768], [768, 896], [960, 704], [704, 960], [1024, 640], [640, 1024], [1152, 576], [576, 1152], [1280, 512], [512, 1280], [1344, 512], [512, 1344], [1408, 448], [448, 1408], [1472, 448], [448, 1472], [1536, 384], [384, 1536], [1600, 384], [384, 1600]]

ASPECT_896 = [[896, 896], [960, 832], [832, 960], [1024, 768], [768, 1024], [1088, 704], [704, 1088], [1152, 704], [704, 1152], [1216, 640], [640, 1216], [1280, 640], [640, 1280], [1344, 576], [576, 1344], [1408, 576], [576, 1408], [1472, 512], [512, 1472], [1536, 512], [512, 1536], [1600, 448], [448, 1600], [1664, 448], [448, 1664]]
ASPECT_960 = [[960, 960], [1024, 896], [896, 1024], [1088, 832], [832, 1088], [1152, 768], [768, 1152], [1216, 704], [704, 1216], [1280, 640], [640, 1280], [1344, 576], [576, 1344], [1408, 512], [512, 1408], [1472, 448], [448, 1472], [1536, 384], [384, 1536]]
ASPECT_1024 = [[1024, 1024], [1088, 960], [960, 1088], [1152, 896], [896, 1152], [1216, 832], [832, 1216], [1344, 768], [768, 1344], [1472, 704], [704, 1472], [1600, 640], [640, 1600], [1728, 576], [576, 1728], [1792, 576], [576, 1792]]

ASPECT_768 = [[768, 768],           # 589824 1:1
    [896, 640], [640, 896],         # 573440 1.4:1
    [832, 704], [704, 832],         # 585728 1.181:1
    [960, 576], [576, 960],         # 552960 1.6:1
    [1024, 576], [576, 1024],       # 524288 1.778:1
    [1088, 512], [512, 1088],       # 497664 2.125:1
    [1152, 512], [512, 1152],       # 589824 2.25:1
    [1216, 448], [448, 1216],       # 552960 2.714:1
    [1280, 448], [448, 1280],       # 573440 2.857:1
    [1344, 384], [384, 1344],       # 518400 3.5:1
    [1408, 384], [384, 1408],       # 540672 3.667:1
    [1472, 320], [320, 1472],       # 470400 4.6:1
    [1536, 320], [320, 1536],       # 491520 4.8:1
    ]

ASPECT_704 = [[704, 704],           # 501,376 1:1
    [768, 640], [640, 768],         # 491,520 1.2:1
    [832, 576], [576, 832],         # 458,752 1.444:1
    [896, 512], [512, 896],         # 458,752 1.75:1
    [960, 512], [512, 960],         # 491,520 1.875:1
    [1024, 448], [448, 1024],       # 458,752 2.286:1
    [1088, 448], [448, 1088],       # 487,424 2.429:1
    [1152, 384], [384, 1152],       # 442,368 3:1
    [1216, 384], [384, 1216],       # 466,944 3.125:1
    [1280, 384], [384, 1280],       # 491,520 3.333:1
    [1280, 320], [320, 1280],       # 409,600 4:1
    [1408, 320], [320, 1408],       # 450,560 4.4:1
    [1536, 320], [320, 1536],       # 491,520 4.8:1
    ]

ASPECT_640 = [[640, 640],           # 409600 1:1
    [704, 576], [576, 704],         # 405504 1.25:1
    [768, 512], [512, 768],         # 393216 1.5:1
    [896, 448], [448, 896],         # 401408 2:1
    [1024, 384], [384, 1024],       # 393216 2.667:1
    [1280, 320], [320, 1280],       # 409600 4:1
    [1408, 256], [256, 1408],       # 360448 5.5:1
    [1472, 256], [256, 1472],       # 376832 5.75:1
    [1536, 256], [256, 1536],       # 393216 6:1
    [1600, 256], [256, 1600],       # 409600 6.25:1
    ]

ASPECT_576 = [[576, 576],           # 331776 1:1
    [640, 512], [512, 640],         # 327680 1.25:1
    [640, 448], [448, 640],         # 286720 1.4286:1
    [704, 448], [448, 704],         # 314928 1.5625:1
    [832, 384], [384, 832],         # 317440 2.1667:1
    [1024, 320], [320, 1024],       # 327680 3.2:1
    [1280, 256], [256, 1280],       # 327680 5:1
    ]

ASPECT_512 = [[512, 512],           # 262144 1:1
    [576, 448], [448, 576],         # 258048 1.29:1
    [640, 384], [384, 640],         # 245760 1.667:1
    [768, 320], [320, 768],         # 245760 2.4:1
    [832, 256], [256, 832],         # 212992 3.25:1
    [896, 256], [256, 896],         # 229376 3.5:1
    [960, 256], [256, 960],         # 245760 3.75:1
    [1024, 256], [256, 1024],       # 245760 4:1
    ]

ASPECT_448 = [[448, 448],           # 200704 1:1
    [512, 384], [384, 512],         # 196608 1.33:1
    [576, 320], [320, 576],         # 184320 1.8:1
    [768, 256], [256, 768],         # 196608 3:1
    ]

ASPECT_384 = [[384, 384],           # 147456 1:1
    [448, 320], [320, 448],         # 143360 1.4:1
    [576, 256], [256, 576],         # 147456 2.25:1
    [768, 192], [192, 768],         # 147456 4:1
    ]

ASPECT_320 = [[320, 320],           # 102400 1:1
    [384, 256], [256, 384],         # 98304 1.5:1
    [512, 192], [192, 512],         # 98304 2.67:1
    ]

ASPECT_256 = [[256, 256],           # 65536 1:1
    [320, 192], [192, 320],         # 61440 1.67:1
    [512, 128], [128, 512],         # 65536 4:1
    ]

#failsafe aspects
ASPECTS = ASPECT_512

def get_aspect_buckets(resolution, mode=''):
    if resolution < 256:
        raise ValueError("Resolution must be at least 256")
    try:
        rounded_resolution = int(resolution / 64) * 64
        print(f" {bcolors.WARNING} Rounded resolution to: {rounded_resolution}{bcolors.ENDC}")
        all_image_sizes = __get_all_aspects()
        if mode == 'MJ':
            #truncate to the first 3 resolutions
            all_image_sizes = [x[0:3] for x in all_image_sizes]
        aspects = next(filter(lambda sizes: sizes[0][0] == rounded_resolution, all_image_sizes), None)
        ASPECTS = aspects
        #print(aspects)
        return aspects
    except Exception as e:
        print(f" {bcolors.FAIL} *** Could not find selected resolution: {rounded_resolution}{bcolors.ENDC}")

        raise e

def __get_all_aspects():
    return [ASPECT_256, ASPECT_320, ASPECT_384, ASPECT_448, ASPECT_512, ASPECT_576, ASPECT_640, ASPECT_704, ASPECT_768, ASPECT_832, ASPECT_896, ASPECT_960, ASPECT_1024, ASPECT_1088, ASPECT_1152, ASPECT_1216, ASPECT_1280, ASPECT_1344, ASPECT_1408, ASPECT_1472, ASPECT_1536, ASPECT_1600, ASPECT_1664, ASPECT_1728, ASPECT_1792, ASPECT_1856, ASPECT_1920, ASPECT_1984, ASPECT_2048]

class AutoBucketing(Dataset):
    def __init__(self,
                 concepts_list,
                 tokenizer=None,
                 flip_p=0.0,
                 repeats=1,
                 debug_level=0,
                 batch_size=1,
                 set='val',
                 resolution=512,
                 center_crop=False,
                 use_image_names_as_captions=True,
                 shuffle_captions=False,
                 add_class_images_to_dataset=None,
                 balance_datasets=False,
                 crop_jitter=20,
                 with_prior_loss=False,
                 use_text_files_as_captions=False,
                 aspect_mode='dynamic',
                 action_preference='dynamic',
                 seed=555,
                 model_variant='base',
                 extra_module=None,
                 mask_prompts=None,
                 load_mask=False,
                 ):

        self.debug_level = debug_level
        self.resolution = resolution
        self.center_crop = center_crop
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.concepts_list = concepts_list
        self.use_image_names_as_captions = use_image_names_as_captions
        self.shuffle_captions = shuffle_captions
        self.num_train_images = 0
        self.num_reg_images = 0
        self.image_train_items = []
        self.image_reg_items = []
        self.add_class_images_to_dataset = add_class_images_to_dataset
        self.balance_datasets = balance_datasets
        self.crop_jitter = crop_jitter
        self.with_prior_loss = with_prior_loss
        self.use_text_files_as_captions = use_text_files_as_captions
        self.aspect_mode = aspect_mode
        self.action_preference = action_preference
        self.model_variant = model_variant
        self.extra_module = extra_module
        self.image_transforms = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )
        self.mask_transforms = transforms.Compose(
            [
                transforms.ToTensor(),
            ]
        )
        self.depth_image_transforms = transforms.Compose(
            [
                transforms.ToTensor(),
            ]
        )
        self.seed = seed
        #shared_dataloader = None
        print(f" {bcolors.WARNING}Creating Auto Bucketing Dataloader{bcolors.ENDC}")

        shared_dataloader = DataLoaderMultiAspect(concepts_list,
            debug_level=debug_level,
            resolution=self.resolution,
            seed=self.seed,
            batch_size=self.batch_size,
            flip_p=flip_p,
            use_image_names_as_captions=self.use_image_names_as_captions,
            add_class_images_to_dataset=self.add_class_images_to_dataset,
            balance_datasets=self.balance_datasets,
            with_prior_loss=self.with_prior_loss,
            use_text_files_as_captions=self.use_text_files_as_captions,
            aspect_mode=self.aspect_mode,
            action_preference=self.action_preference,
            model_variant=self.model_variant,
            extra_module=self.extra_module,
            mask_prompts=mask_prompts,
            load_mask=load_mask,
        )

        #print(self.image_train_items)
        if self.with_prior_loss and self.add_class_images_to_dataset == False:
            self.image_train_items, self.class_train_items = shared_dataloader.get_all_images()
            self.num_train_images = self.num_train_images + len(self.image_train_items)
            self.num_reg_images = self.num_reg_images + len(self.class_train_items)
            self._length = max(max(math.trunc(self.num_train_images * repeats), batch_size), math.trunc(self.num_reg_images * repeats), batch_size) - self.num_train_images % self.batch_size
            self.num_train_images = self.num_train_images + self.num_reg_images

        else:
            self.image_train_items = shared_dataloader.get_all_images()
            self.num_train_images = self.num_train_images + len(self.image_train_items)
            self._length = max(math.trunc(self.num_train_images * repeats), batch_size) - self.num_train_images % self.batch_size

        print()
        print(f" {bcolors.WARNING} ** Validation Set: {set}, steps: {self._length / batch_size:.0f}, repeats: {repeats} {bcolors.ENDC}")
        print()

    def __len__(self):
        return self._length

    def __getitem__(self, i):
        idx = i % self.num_train_images
        #print(idx)
        image_train_item = self.image_train_items[idx]

        example = self.__get_image_for_trainer(image_train_item, debug_level=self.debug_level)
        if self.with_prior_loss and self.add_class_images_to_dataset == False:
            idx = i % self.num_reg_images
            class_train_item = self.class_train_items[idx]
            example_class = self.__get_image_for_trainer(class_train_item, debug_level=self.debug_level, class_img=True)
            example = {**example, **example_class}

        #print the tensor shape
        #print(example['instance_images'].shape)
        #print(example.keys())
        return example

    def normalize8(self, I):
        mn = I.min()
        mx = I.max()

        mx -= mn

        I = ((I - mn) / mx) * 255
        return I.astype(np.uint8)

    def __get_image_for_trainer(self, image_train_item, debug_level=0, class_img=False):
        example = {}
        save = debug_level > 2

        if class_img == False:
            image_train_tmp = image_train_item.hydrate(crop=False, save=0, crop_jitter=self.crop_jitter)
            image_train_tmp_image = Image.fromarray(self.normalize8(image_train_tmp.image)).convert("RGB")

            instance_prompt = image_train_tmp.caption
            if self.shuffle_captions:
                caption_parts = instance_prompt.split(",")
                random.shuffle(caption_parts)
                instance_prompt = ",".join(caption_parts)

            example["instance_images"] = self.image_transforms(image_train_tmp_image)
            if image_train_tmp.mask is not None:
                image_train_tmp_mask = Image.fromarray(self.normalize8(image_train_tmp.mask)).convert("L")
                example["mask"] = self.mask_transforms(image_train_tmp_mask)
            if self.model_variant == 'depth2img':
                image_train_tmp_depth = Image.fromarray(self.normalize8(image_train_tmp.extra)).convert("L")
                example["instance_depth_images"] = self.depth_image_transforms(image_train_tmp_depth)
            #print(instance_prompt)
            example["instance_prompt_ids"] = self.tokenizer(
                instance_prompt,
                padding="do_not_pad",
                truncation=True,
                max_length=self.tokenizer.model_max_length,
            ).input_ids
            image_train_item.self_destruct()
            return example

        if class_img == True:
            image_train_tmp = image_train_item.hydrate(crop=False, save=4, crop_jitter=self.crop_jitter)
            image_train_tmp_image = Image.fromarray(self.normalize8(image_train_tmp.image)).convert("RGB")
            if self.model_variant == 'depth2img':
                image_train_tmp_depth = Image.fromarray(self.normalize8(image_train_tmp.extra)).convert("L")
                example["class_depth_images"] = self.depth_image_transforms(image_train_tmp_depth)
            example["class_images"] = self.image_transforms(image_train_tmp_image)
            example["class_prompt_ids"] = self.tokenizer(
                image_train_tmp.caption,
                padding="do_not_pad",
                truncation=True,
                max_length=self.tokenizer.model_max_length,
            ).input_ids
            image_train_item.self_destruct()
            return example

_RANDOM_TRIM = 0.04

class ImageTrainItem():
    """
    image: Image
    mask: Image
    extra: Image
    identifier: caption,
    target_aspect: (width, height),
    pathname: path to image file
    flip_p: probability of flipping image (0.0 to 1.0)
    """
    def __init__(self, image: Image, mask: Image, extra: Image, caption: str, target_wh: list, pathname: str, flip_p=0.0, model_variant='base', load_mask=False):
        self.caption = caption
        self.target_wh = target_wh
        self.pathname = pathname
        self.mask_pathname = os.path.splitext(pathname)[0] + "-masklabel.png"
        self.depth_pathname = os.path.splitext(pathname)[0] + "-depth.png"
        self.flip_p = flip_p
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)
        self.cropped_img = None
        self.model_variant = model_variant
        self.load_mask = load_mask
        self.is_dupe = []
        self.variant_warning = False

        self.image = image
        self.mask = mask
        self.extra = extra

    def self_destruct(self):
        self.image = None
        self.mask = None
        self.extra = None
        self.cropped_img = None
        self.is_dupe.append(1)

    def load_image(self, pathname, crop, jitter_amount, flip):
        if len(self.is_dupe) > 0:
            self.flip = transforms.RandomHorizontalFlip(p=1.0 if flip else 0.0)
        image = Image.open(pathname).convert('RGB')

        width, height = image.size
        if crop:
            cropped_img = self.__autocrop(image)
            image = cropped_img.resize((512, 512), resample=Image.Resampling.LANCZOS)
        else:
            width, height = image.size

            if self.target_wh[0] == self.target_wh[1]:
                if width > height:
                    left = random.randint(0, width - height)
                    image = image.crop((left, 0, height + left, height))
                    width = height
                elif height > width:
                    top = random.randint(0, height - width)
                    image = image.crop((0, top, width, width + top))
                    height = width
                elif width > self.target_wh[0]:
                    slice = min(int(self.target_wh[0] * _RANDOM_TRIM), width - self.target_wh[0])
                    slicew_ratio = random.random()
                    left = int(slice * slicew_ratio)
                    right = width - int(slice * (1 - slicew_ratio))
                    sliceh_ratio = random.random()
                    top = int(slice * sliceh_ratio)
                    bottom = height - int(slice * (1 - sliceh_ratio))

                    image = image.crop((left, top, right, bottom))
            else:
                image_aspect = width / height
                target_aspect = self.target_wh[0] / self.target_wh[1]
                if image_aspect > target_aspect:
                    new_width = int(height * target_aspect)
                    jitter_amount = max(min(jitter_amount, int(abs(width - new_width) / 2)), 0)
                    left = jitter_amount
                    right = left + new_width
                    image = image.crop((left, 0, right, height))
                else:
                    new_height = int(width / target_aspect)
                    jitter_amount = max(min(jitter_amount, int(abs(height - new_height) / 2)), 0)
                    top = jitter_amount
                    bottom = top + new_height
                    image = image.crop((0, top, width, bottom))
            # LANCZOS resample
            image = image.resize(self.target_wh, resample=Image.Resampling.LANCZOS)
            # print the pixel count of the image
            # print path to image file
            # print(self.pathname)
            # print(self.image.size[0] * self.image.size[1])
        image = self.flip(image)
        return image

    def hydrate(self, crop=False, save=False, crop_jitter=20):
        """
        crop: hard center crop to 512x512
        save: save the cropped image to disk, for manual inspection of resize/crop
        crop_jitter: randomly shift crop by N pixels when using multiple aspect ratios to improve training quality
        """

        if self.image is None:
            chance = float(len(self.is_dupe)) / 10.0

            flip_p = self.flip_p + chance if chance < 1.0 else 1.0
            flip = random.uniform(0, 1) < flip_p

            if len(self.is_dupe) > 0:
                crop_jitter = crop_jitter + (len(self.is_dupe) * 10) if crop_jitter < 50 else 50

            jitter_amount = random.randint(0, crop_jitter)

            self.image = self.load_image(self.pathname, crop, jitter_amount, flip)

            if self.model_variant == "inpainting" or self.load_mask:
                if os.path.exists(self.mask_pathname) and self.load_mask:
                    self.mask = self.load_image(self.mask_pathname, crop, jitter_amount, flip)
                else:
                    if self.variant_warning == False:
                        print(f" {bcolors.FAIL} ** Warning: No mask found for an image, using an empty mask but make sure you're training the right model variant.{bcolors.ENDC}")
                        self.variant_warning = True
                    self.mask = Image.new('RGB', self.image.size, color="white").convert("L")

            if self.model_variant == "depth2img":
                if os.path.exists(self.depth_pathname):
                    self.extra = self.load_image(self.depth_pathname, crop, jitter_amount, flip)
                else:
                    if self.variant_warning == False:
                        print(f" {bcolors.FAIL} ** Warning: No depth found for an image, using an empty depth but make sure you're training the right model variant.{bcolors.ENDC}")
                        self.variant_warning = True
                    self.extra = Image.new('RGB', self.image.size, color="white").convert("L")
        if type(self.image) is not np.ndarray:
            if save:
                base_name = os.path.basename(self.pathname)
                if not os.path.exists("test/output"):
                    os.makedirs("test/output")
                self.image.save(f"test/output/{base_name}")

            self.image = np.array(self.image).astype(np.uint8)

            self.image = (self.image / 127.5 - 1.0).astype(np.float32)
        if self.mask is not None and type(self.mask) is not np.ndarray:
            self.mask = np.array(self.mask).astype(np.uint8)

            self.mask = (self.mask / 255.0).astype(np.float32)
        if self.extra is not None and type(self.extra) is not np.ndarray:
            self.extra = np.array(self.extra).astype(np.uint8)

            self.extra = (self.extra / 255.0).astype(np.float32)

        #print(self.image.shape)

        return self

class CachedLatentsDataset(Dataset):
    #stores paths and loads latents on the fly
    def __init__(self, cache_paths=(), batch_size=None, tokenizer=None, text_encoder=None, dtype=None, model_variant='base', shuffle_per_epoch=False, args=None):
        self.cache_paths = cache_paths
        self.tokenizer = tokenizer
        self.args = args
        self.text_encoder = text_encoder
        #get text encoder device
        text_encoder_device = next(self.text_encoder.parameters()).device
        self.empty_batch = [self.tokenizer('', padding="do_not_pad", truncation=True, max_length=self.tokenizer.model_max_length,).input_ids for i in range(batch_size)]
        #handle text encoder for empty tokens
        if self.args.train_text_encoder != True:
            self.empty_tokens = tokenizer.pad({"input_ids": self.empty_batch}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt",).to(text_encoder_device).input_ids
            self.empty_tokens.to(text_encoder_device, dtype=dtype)
            self.empty_tokens = self.text_encoder(self.empty_tokens)[0]
        else:
            self.empty_tokens = tokenizer.pad({"input_ids": self.empty_batch}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt",).input_ids
            self.empty_tokens.to(text_encoder_device, dtype=dtype)

        self.conditional_dropout = args.conditional_dropout
        self.conditional_indexes = []
        self.model_variant = model_variant
        self.shuffle_per_epoch = shuffle_per_epoch

    def __len__(self):
        return len(self.cache_paths)

    def __getitem__(self, index):
        if index == 0:
            if self.shuffle_per_epoch == True:
                self.cache_paths = tuple(random.sample(self.cache_paths, len(self.cache_paths)))
            if len(self.cache_paths) > 1:
                possible_indexes_extension = None
                possible_indexes = list(range(0, len(self.cache_paths)))
                #conditional dropout is a percentage of images to drop from the total cache_paths
                if self.conditional_dropout != None:
                    if len(self.conditional_indexes) == 0:
                        self.conditional_indexes = random.sample(possible_indexes, k=int(math.ceil(len(possible_indexes) * self.conditional_dropout)))
                    else:
                        #pick indexes from the remaining possible indexes
                        possible_indexes_extension = [i for i in possible_indexes if i not in self.conditional_indexes]
                        #duplicate all values in possible_indexes_extension
                        possible_indexes_extension = possible_indexes_extension + possible_indexes_extension
                        possible_indexes_extension = possible_indexes_extension + self.conditional_indexes
                        self.conditional_indexes = random.sample(possible_indexes_extension, k=int(math.ceil(len(possible_indexes) * self.conditional_dropout)))
                        #check for duplicates in conditional_indexes values
                        if len(self.conditional_indexes) != len(set(self.conditional_indexes)):
                            #remove duplicates
                            self.conditional_indexes_non_dupe = list(set(self.conditional_indexes))
                            #add a random value from possible_indexes_extension for each duplicate
                            for i in range(len(self.conditional_indexes) - len(self.conditional_indexes_non_dupe)):
                                while True:
                                    random_value = random.choice(possible_indexes_extension)
                                    if random_value not in self.conditional_indexes_non_dupe:
                                        self.conditional_indexes_non_dupe.append(random_value)
                                        break
                            self.conditional_indexes = self.conditional_indexes_non_dupe
        self.cache = torch.load(self.cache_paths[index])
        self.latents = self.cache.latents_cache[0]
        self.tokens = self.cache.tokens_cache[0]
        self.extra_cache = None
        self.mask_cache = None
        if self.cache.mask_cache is not None:
            self.mask_cache = self.cache.mask_cache[0]
        self.mask_mean_cache = None
        if self.cache.mask_mean_cache is not None:
            self.mask_mean_cache = self.cache.mask_mean_cache[0]
        if index in self.conditional_indexes:
            self.text_encoder = self.empty_tokens
        else:
            self.text_encoder = self.cache.text_encoder_cache[0]
        if self.model_variant != 'base':
            self.extra_cache = self.cache.extra_cache[0]
        del self.cache
        return self.latents, self.text_encoder, self.mask_cache, self.mask_mean_cache, self.extra_cache, self.tokens

    def add_pt_cache(self, cache_path):
        if len(self.cache_paths) == 0:
            self.cache_paths = (cache_path,)
        else:
            self.cache_paths += (cache_path,)

class LatentsDataset(Dataset):
    def __init__(self, latents_cache=None, text_encoder_cache=None, mask_cache=None, mask_mean_cache=None, extra_cache=None, tokens_cache=None):
        self.latents_cache = latents_cache
        self.text_encoder_cache = text_encoder_cache
        self.mask_cache = mask_cache
        self.mask_mean_cache = mask_mean_cache
        self.extra_cache = extra_cache
        self.tokens_cache = tokens_cache

    def add_latent(self, latent, text_encoder, cached_mask, cached_extra, tokens_cache):
        self.latents_cache.append(latent)
        self.text_encoder_cache.append(text_encoder)
        self.mask_cache.append(cached_mask)
        self.mask_mean_cache.append(None if cached_mask is None else cached_mask.mean())
        self.extra_cache.append(cached_extra)
        self.tokens_cache.append(tokens_cache)

    def __len__(self):
        return len(self.latents_cache)

    def __getitem__(self, index):
        return self.latents_cache[index], self.text_encoder_cache[index], self.mask_cache[index], self.mask_mean_cache[index], self.extra_cache[index], self.tokens_cache[index]

class DataLoaderMultiAspect():
    """
    Data loader for multi-aspect-ratio training and bucketing
    data_root: root folder of training data
    batch_size: number of images per batch
    flip_p: probability of flipping image horizontally (i.e. 0-0.5)
    """
    def __init__(
        self,
        concept_list,
        seed=555,
        debug_level=0,
        resolution=512,
        batch_size=1,
        flip_p=0.0,
        use_image_names_as_captions=True,
        add_class_images_to_dataset=False,
        balance_datasets=False,
        with_prior_loss=False,
        use_text_files_as_captions=False,
        aspect_mode='dynamic',
        action_preference='add',
        model_variant='base',
        extra_module=None,
        mask_prompts=None,
        load_mask=False,
    ):
        self.resolution = resolution
        self.debug_level = debug_level
        self.flip_p = flip_p
        self.use_image_names_as_captions = use_image_names_as_captions
        self.balance_datasets = balance_datasets
        self.with_prior_loss = with_prior_loss
        self.add_class_images_to_dataset = add_class_images_to_dataset
        self.use_text_files_as_captions = use_text_files_as_captions
        self.aspect_mode = aspect_mode
        self.action_preference = action_preference
        self.seed = seed
        self.model_variant = model_variant
        self.extra_module = extra_module
        self.load_mask = load_mask
        prepared_train_data = []

        self.aspects = get_aspect_buckets(resolution)
        #print(f"* DLMA resolution {resolution}, buckets: {self.aspects}")
        #process sub directories flag

        print(f" {bcolors.WARNING} Preloading images...{bcolors.ENDC}")

        if balance_datasets:
            print(f" {bcolors.WARNING} Balancing datasets...{bcolors.ENDC}")
            #get the concept with the least number of images in instance_data_dir
            for concept in concept_list:
                count = 0
                if 'use_sub_dirs' in concept:
                    if concept['use_sub_dirs'] == 1:
                        tot = 0
                        for root, dirs, files in os.walk(concept['instance_data_dir']):
                            tot += len(files)
                        count = tot
                    else:
                        count = len(os.listdir(concept['instance_data_dir']))
                else:
                    count = len(os.listdir(concept['instance_data_dir']))
                print(f"{concept['instance_data_dir']} has count of {count}")
                concept['count'] = count

            min_concept = min(concept_list, key=lambda x: x['count'])
            #get the number of images in the concept with the least number of images
            min_concept_num_images = min_concept['count']
            print(" Min concept: ", min_concept['instance_data_dir'], " with ", min_concept_num_images, " images")

            balance_cocnept_list = []
            for concept in concept_list:
                #if concept has a key do not balance it
                if 'do_not_balance' in concept:
                    if concept['do_not_balance'] == True:
                        balance_cocnept_list.append(-1)
                    else:
                        balance_cocnept_list.append(min_concept_num_images)
                else:
                    balance_cocnept_list.append(min_concept_num_images)
        for concept in concept_list:
            if 'use_sub_dirs' in concept:
                if concept['use_sub_dirs'] == True:
                    use_sub_dirs = True
                else:
                    use_sub_dirs = False
            else:
                use_sub_dirs = False
            self.image_paths = []
            #self.class_image_paths = []
            min_concept_num_images = None
            if balance_datasets:
                min_concept_num_images = balance_cocnept_list[concept_list.index(concept)]
            data_root = concept['instance_data_dir']
            data_root_class = concept['class_data_dir']
            concept_prompt = concept['instance_prompt']
            concept_class_prompt = concept['class_prompt']
            if 'flip_p' in concept.keys():
                flip_p = concept['flip_p']
                if flip_p == '':
                    flip_p = 0.0
                else:
                    flip_p = float(flip_p)
            self.__recurse_data_root(self=self, recurse_root=data_root, use_sub_dirs=use_sub_dirs)
|
869 |
+
random.Random(self.seed).shuffle(self.image_paths)
|
870 |
+
if self.model_variant == 'depth2img':
|
871 |
+
print(f" {bcolors.WARNING} ** Loading Depth2Img Pipeline To Process Dataset{bcolors.ENDC}")
|
872 |
+
self.vae_scale_factor = self.extra_module.depth_images(self.image_paths)
|
873 |
+
prepared_train_data.extend(self.__prescan_images(debug_level, self.image_paths, flip_p,use_image_names_as_captions,concept_prompt,use_text_files_as_captions=self.use_text_files_as_captions)[0:min_concept_num_images]) # ImageTrainItem[]
|
874 |
+
if add_class_images_to_dataset:
|
875 |
+
self.image_paths = []
|
876 |
+
self.__recurse_data_root(self=self, recurse_root=data_root_class,use_sub_dirs=use_sub_dirs)
|
877 |
+
random.Random(self.seed).shuffle(self.image_paths)
|
878 |
+
use_image_names_as_captions = False
|
879 |
+
prepared_train_data.extend(self.__prescan_images(debug_level, self.image_paths, flip_p,use_image_names_as_captions,concept_class_prompt,use_text_files_as_captions=self.use_text_files_as_captions)) # ImageTrainItem[]
|
880 |
+
|
881 |
+
self.image_caption_pairs = self.__bucketize_images(prepared_train_data, batch_size=batch_size, debug_level=debug_level,aspect_mode=self.aspect_mode,action_preference=self.action_preference)
|
882 |
+
if self.with_prior_loss and add_class_images_to_dataset == False:
|
883 |
+
self.class_image_caption_pairs = []
|
884 |
+
for concept in concept_list:
|
885 |
+
self.class_images_path = []
|
886 |
+
data_root_class = concept['class_data_dir']
|
887 |
+
concept_class_prompt = concept['class_prompt']
|
888 |
+
self.__recurse_data_root(self=self, recurse_root=data_root_class,use_sub_dirs=use_sub_dirs,class_images=True)
|
889 |
+
random.Random(seed).shuffle(self.image_paths)
|
890 |
+
if self.model_variant == 'depth2img':
|
891 |
+
print(f" {bcolors.WARNING} ** Depth2Img To Process Class Dataset{bcolors.ENDC}")
|
892 |
+
self.vae_scale_factor = self.extra_module.depth_images(self.image_paths)
|
893 |
+
use_image_names_as_captions = False
|
894 |
+
self.class_image_caption_pairs.extend(self.__prescan_images(debug_level, self.class_images_path, flip_p,use_image_names_as_captions,concept_class_prompt,use_text_files_as_captions=self.use_text_files_as_captions))
|
895 |
+
self.class_image_caption_pairs = self.__bucketize_images(self.class_image_caption_pairs, batch_size=batch_size, debug_level=debug_level,aspect_mode=self.aspect_mode,action_preference=self.action_preference)
|
896 |
+
if mask_prompts is not None:
|
897 |
+
print(f" {bcolors.WARNING} Checking and generating missing masks...{bcolors.ENDC}")
|
898 |
+
clip_seg = ClipSeg()
|
899 |
+
clip_seg.mask_images(self.image_paths, mask_prompts)
|
900 |
+
del clip_seg
|
901 |
+
if debug_level > 0: print(f" * DLMA Example: {self.image_caption_pairs[0]} images")
|
902 |
+
#print the length of image_caption_pairs
|
903 |
+
print(f" {bcolors.WARNING} Number of image-caption pairs: {len(self.image_caption_pairs)}{bcolors.ENDC}")
|
904 |
+
if len(self.image_caption_pairs) == 0:
|
905 |
+
raise Exception("All the buckets are empty. Please check your data or reduce the batch size.")
|
906 |
+
def get_all_images(self):
|
907 |
+
if self.with_prior_loss == False:
|
908 |
+
return self.image_caption_pairs
|
909 |
+
else:
|
910 |
+
return self.image_caption_pairs, self.class_image_caption_pairs
|
911 |
+
def __prescan_images(self,debug_level: int, image_paths: list, flip_p=0.0,use_image_names_as_captions=True,concept=None,use_text_files_as_captions=False):
|
912 |
+
"""
|
913 |
+
Create ImageTrainItem objects with metadata for hydration later
|
914 |
+
"""
|
915 |
+
decorated_image_train_items = []
|
916 |
+
|
917 |
+
for pathname in image_paths:
|
918 |
+
identifier = concept
|
919 |
+
if use_image_names_as_captions:
|
920 |
+
caption_from_filename = os.path.splitext(os.path.basename(pathname))[0].split("_")[0]
|
921 |
+
identifier = caption_from_filename
|
922 |
+
if use_text_files_as_captions:
|
923 |
+
txt_file_path = os.path.splitext(pathname)[0] + ".txt"
|
924 |
+
|
925 |
+
if os.path.exists(txt_file_path):
|
926 |
+
try:
|
927 |
+
with open(txt_file_path, 'r',encoding='utf-8',errors='ignore') as f:
|
928 |
+
identifier = f.readline().rstrip()
|
929 |
+
f.close()
|
930 |
+
if len(identifier) < 1:
|
931 |
+
raise ValueError(f" *** Could not find valid text in: {txt_file_path}")
|
932 |
+
|
933 |
+
except Exception as e:
|
934 |
+
print(f" {bcolors.FAIL} *** Error reading {txt_file_path} to get caption, falling back to filename{bcolors.ENDC}")
|
935 |
+
print(e)
|
936 |
+
identifier = caption_from_filename
|
937 |
+
pass
|
938 |
+
#print("identifier: ",identifier)
|
939 |
+
image = Image.open(pathname)
|
940 |
+
width, height = image.size
|
941 |
+
image_aspect = width / height
|
942 |
+
|
943 |
+
target_wh = min(self.aspects, key=lambda aspects:abs(aspects[0]/aspects[1] - image_aspect))
|
944 |
+
|
945 |
+
image_train_item = ImageTrainItem(image=None, mask=None, extra=None, caption=identifier, target_wh=target_wh, pathname=pathname, flip_p=flip_p,model_variant=self.model_variant, load_mask=self.load_mask)
|
946 |
+
|
947 |
+
decorated_image_train_items.append(image_train_item)
|
948 |
+
return decorated_image_train_items
|
949 |
+
|
950 |
+
@staticmethod
|
951 |
+
def __bucketize_images(prepared_train_data: list, batch_size=1, debug_level=0,aspect_mode='dynamic',action_preference='add'):
|
952 |
+
"""
|
953 |
+
Put images into buckets based on aspect ratio with batch_size*n images per bucket, discards remainder
|
954 |
+
"""
|
955 |
+
|
956 |
+
# TODO: this is not terribly efficient but at least linear time
|
957 |
+
buckets = {}
|
958 |
+
for image_caption_pair in prepared_train_data:
|
959 |
+
target_wh = image_caption_pair.target_wh
|
960 |
+
|
961 |
+
if (target_wh[0],target_wh[1]) not in buckets:
|
962 |
+
buckets[(target_wh[0],target_wh[1])] = []
|
963 |
+
buckets[(target_wh[0],target_wh[1])].append(image_caption_pair)
|
964 |
+
print(f" ** Number of buckets: {len(buckets)}")
|
965 |
+
for bucket in buckets:
|
966 |
+
bucket_len = len(buckets[bucket])
|
967 |
+
#real_len = len(buckets[bucket])+1
|
968 |
+
#print(real_len)
|
969 |
+
truncate_amount = bucket_len % batch_size
|
970 |
+
add_amount = batch_size - bucket_len % batch_size
|
971 |
+
action = None
|
972 |
+
#print(f" ** Bucket {bucket} has {bucket_len} images")
|
973 |
+
if aspect_mode == 'dynamic':
|
974 |
+
if batch_size == bucket_len:
|
975 |
+
action = None
|
976 |
+
elif add_amount < truncate_amount and add_amount != 0 and add_amount != batch_size or truncate_amount == 0:
|
977 |
+
action = 'add'
|
978 |
+
#print(f'should add {add_amount}')
|
979 |
+
elif truncate_amount < add_amount and truncate_amount != 0 and truncate_amount != batch_size and batch_size < bucket_len:
|
980 |
+
#print(f'should truncate {truncate_amount}')
|
981 |
+
action = 'truncate'
|
982 |
+
#truncate the bucket
|
983 |
+
elif truncate_amount == add_amount:
|
984 |
+
if action_preference == 'add':
|
985 |
+
action = 'add'
|
986 |
+
elif action_preference == 'truncate':
|
987 |
+
action = 'truncate'
|
988 |
+
elif batch_size > bucket_len:
|
989 |
+
action = 'add'
|
990 |
+
|
991 |
+
elif aspect_mode == 'add':
|
992 |
+
action = 'add'
|
993 |
+
elif aspect_mode == 'truncate':
|
994 |
+
action = 'truncate'
|
995 |
+
if action == None:
|
996 |
+
action = None
|
997 |
+
#print('no need to add or truncate')
|
998 |
+
if action == None:
|
999 |
+
#print('test')
|
1000 |
+
current_bucket_size = bucket_len
|
1001 |
+
print(f" ** Bucket {bucket} found {bucket_len}, nice!")
|
1002 |
+
elif action == 'add':
|
1003 |
+
#copy the bucket
|
1004 |
+
shuffleBucket = random.sample(buckets[bucket], bucket_len)
|
1005 |
+
#add the images to the bucket
|
1006 |
+
current_bucket_size = bucket_len
|
1007 |
+
truncate_count = (bucket_len) % batch_size
|
1008 |
+
#how many images to add to the bucket to fill the batch
|
1009 |
+
addAmount = batch_size - truncate_count
|
1010 |
+
if addAmount != batch_size:
|
1011 |
+
added=0
|
1012 |
+
while added != addAmount:
|
1013 |
+
randomIndex = random.randint(0,len(shuffleBucket)-1)
|
1014 |
+
#print(str(randomIndex))
|
1015 |
+
buckets[bucket].append(shuffleBucket[randomIndex])
|
1016 |
+
added+=1
|
1017 |
+
print(f" ** Bucket {bucket} found {bucket_len} images, will {bcolors.OKCYAN}duplicate {added} images{bcolors.ENDC} due to batch size {bcolors.WARNING}{batch_size}{bcolors.ENDC}")
|
1018 |
+
else:
|
1019 |
+
print(f" ** Bucket {bucket} found {bucket_len}, {bcolors.OKGREEN}nice!{bcolors.ENDC}")
|
1020 |
+
elif action == 'truncate':
|
1021 |
+
truncate_count = (bucket_len) % batch_size
|
1022 |
+
current_bucket_size = bucket_len
|
1023 |
+
buckets[bucket] = buckets[bucket][:current_bucket_size - truncate_count]
|
1024 |
+
print(f" ** Bucket {bucket} found {bucket_len} images, will {bcolors.FAIL}drop {truncate_count} images{bcolors.ENDC} due to batch size {bcolors.WARNING}{batch_size}{bcolors.ENDC}")
|
1025 |
+
|
1026 |
+
|
1027 |
+
# flatten the buckets
|
1028 |
+
image_caption_pairs = []
|
1029 |
+
for bucket in buckets:
|
1030 |
+
image_caption_pairs.extend(buckets[bucket])
|
1031 |
+
|
1032 |
+
return image_caption_pairs
|
1033 |
+
|
1034 |
+
@staticmethod
|
1035 |
+
def __recurse_data_root(self, recurse_root,use_sub_dirs=True,class_images=False):
|
1036 |
+
progress_bar = tqdm(os.listdir(recurse_root), desc=f" {bcolors.WARNING} ** Processing {recurse_root}{bcolors.ENDC}")
|
1037 |
+
for f in os.listdir(recurse_root):
|
1038 |
+
current = os.path.join(recurse_root, f)
|
1039 |
+
if os.path.isfile(current):
|
1040 |
+
ext = os.path.splitext(f)[1].lower()
|
1041 |
+
if '-depth' in f or '-masklabel' in f:
|
1042 |
+
progress_bar.update(1)
|
1043 |
+
continue
|
1044 |
+
if ext in ['.jpg', '.jpeg', '.png', '.bmp', '.webp']:
|
1045 |
+
#try to open the file to make sure it's a valid image
|
1046 |
+
try:
|
1047 |
+
img = Image.open(current)
|
1048 |
+
except:
|
1049 |
+
print(f" ** Skipping {current} because it failed to open, please check the file")
|
1050 |
+
progress_bar.update(1)
|
1051 |
+
continue
|
1052 |
+
del img
|
1053 |
+
if class_images == False:
|
1054 |
+
self.image_paths.append(current)
|
1055 |
+
else:
|
1056 |
+
self.class_images_path.append(current)
|
1057 |
+
progress_bar.update(1)
|
1058 |
+
if use_sub_dirs:
|
1059 |
+
sub_dirs = []
|
1060 |
+
|
1061 |
+
for d in os.listdir(recurse_root):
|
1062 |
+
current = os.path.join(recurse_root, d)
|
1063 |
+
if os.path.isdir(current):
|
1064 |
+
sub_dirs.append(current)
|
1065 |
+
|
1066 |
+
for dir in sub_dirs:
|
1067 |
+
self.__recurse_data_root(self=self, recurse_root=dir)
|
1068 |
+
|
1069 |
+
class NormalDataset(Dataset):
|
1070 |
+
"""
|
1071 |
+
A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
|
1072 |
+
It pre-processes the images and the tokenizes prompts.
|
1073 |
+
"""
|
1074 |
+
|
1075 |
+
def __init__(
|
1076 |
+
self,
|
1077 |
+
concepts_list,
|
1078 |
+
tokenizer,
|
1079 |
+
with_prior_preservation=True,
|
1080 |
+
size=512,
|
1081 |
+
center_crop=False,
|
1082 |
+
num_class_images=None,
|
1083 |
+
use_image_names_as_captions=False,
|
1084 |
+
shuffle_captions=False,
|
1085 |
+
repeats=1,
|
1086 |
+
use_text_files_as_captions=False,
|
1087 |
+
seed=555,
|
1088 |
+
model_variant='base',
|
1089 |
+
extra_module=None,
|
1090 |
+
mask_prompts=None,
|
1091 |
+
load_mask=None,
|
1092 |
+
):
|
1093 |
+
self.use_image_names_as_captions = use_image_names_as_captions
|
1094 |
+
self.shuffle_captions = shuffle_captions
|
1095 |
+
self.size = size
|
1096 |
+
self.center_crop = center_crop
|
1097 |
+
self.tokenizer = tokenizer
|
1098 |
+
self.with_prior_preservation = with_prior_preservation
|
1099 |
+
self.use_text_files_as_captions = use_text_files_as_captions
|
1100 |
+
self.image_paths = []
|
1101 |
+
self.class_images_path = []
|
1102 |
+
self.seed = seed
|
1103 |
+
self.model_variant = model_variant
|
1104 |
+
self.variant_warning = False
|
1105 |
+
self.vae_scale_factor = None
|
1106 |
+
self.load_mask = load_mask
|
1107 |
+
for concept in concepts_list:
|
1108 |
+
if 'use_sub_dirs' in concept:
|
1109 |
+
if concept['use_sub_dirs'] == True:
|
1110 |
+
use_sub_dirs = True
|
1111 |
+
else:
|
1112 |
+
use_sub_dirs = False
|
1113 |
+
else:
|
1114 |
+
use_sub_dirs = False
|
1115 |
+
|
1116 |
+
for i in range(repeats):
|
1117 |
+
self.__recurse_data_root(self, concept,use_sub_dirs=use_sub_dirs)
|
1118 |
+
|
1119 |
+
if with_prior_preservation:
|
1120 |
+
for i in range(repeats):
|
1121 |
+
self.__recurse_data_root(self, concept,use_sub_dirs=False,class_images=True)
|
1122 |
+
if mask_prompts is not None:
|
1123 |
+
print(f" {bcolors.WARNING} Checking and generating missing masks{bcolors.ENDC}")
|
1124 |
+
clip_seg = ClipSeg()
|
1125 |
+
clip_seg.mask_images(self.image_paths, mask_prompts)
|
1126 |
+
del clip_seg
|
1127 |
+
|
1128 |
+
random.Random(seed).shuffle(self.image_paths)
|
1129 |
+
self.num_instance_images = len(self.image_paths)
|
1130 |
+
self._length = self.num_instance_images
|
1131 |
+
self.num_class_images = len(self.class_images_path)
|
1132 |
+
self._length = max(self.num_class_images, self.num_instance_images)
|
1133 |
+
if self.model_variant == 'depth2img':
|
1134 |
+
print(f" {bcolors.WARNING} ** Loading Depth2Img Pipeline To Process Dataset{bcolors.ENDC}")
|
1135 |
+
self.vae_scale_factor = extra_module.depth_images(self.image_paths)
|
1136 |
+
if self.with_prior_preservation:
|
1137 |
+
print(f" {bcolors.WARNING} ** Loading Depth2Img Class Processing{bcolors.ENDC}")
|
1138 |
+
extra_module.depth_images(self.class_images_path)
|
1139 |
+
print(f" {bcolors.WARNING} ** Dataset length: {self._length}, {int(self.num_instance_images / repeats)} images using {repeats} repeats{bcolors.ENDC}")
|
1140 |
+
|
1141 |
+
self.image_transforms = transforms.Compose(
|
1142 |
+
[
|
1143 |
+
transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
|
1144 |
+
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
|
1145 |
+
transforms.ToTensor(),
|
1146 |
+
transforms.Normalize([0.5], [0.5]),
|
1147 |
+
]
|
1148 |
+
|
1149 |
+
)
|
1150 |
+
self.mask_transforms = transforms.Compose(
|
1151 |
+
[
|
1152 |
+
transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
|
1153 |
+
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
|
1154 |
+
transforms.ToTensor(),
|
1155 |
+
])
|
1156 |
+
|
1157 |
+
self.depth_image_transforms = transforms.Compose(
|
1158 |
+
[
|
1159 |
+
transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
|
1160 |
+
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
|
1161 |
+
transforms.ToTensor(),
|
1162 |
+
]
|
1163 |
+
)
|
1164 |
+
|
1165 |
+
@staticmethod
|
1166 |
+
def __recurse_data_root(self, recurse_root,use_sub_dirs=True,class_images=False):
|
1167 |
+
#if recurse root is a dict
|
1168 |
+
if isinstance(recurse_root, dict):
|
1169 |
+
if class_images == True:
|
1170 |
+
#print(f" {bcolors.WARNING} ** Processing class images: {recurse_root['class_data_dir']}{bcolors.ENDC}")
|
1171 |
+
concept_token = recurse_root['class_prompt']
|
1172 |
+
data = recurse_root['class_data_dir']
|
1173 |
+
else:
|
1174 |
+
#print(f" {bcolors.WARNING} ** Processing instance images: {recurse_root['instance_data_dir']}{bcolors.ENDC}")
|
1175 |
+
concept_token = recurse_root['instance_prompt']
|
1176 |
+
data = recurse_root['instance_data_dir']
|
1177 |
+
|
1178 |
+
|
1179 |
+
else:
|
1180 |
+
concept_token = None
|
1181 |
+
#progress bar
|
1182 |
+
progress_bar = tqdm(os.listdir(data), desc=f" {bcolors.WARNING} ** Processing {data}{bcolors.ENDC}")
|
1183 |
+
for f in os.listdir(data):
|
1184 |
+
current = os.path.join(data, f)
|
1185 |
+
if os.path.isfile(current):
|
1186 |
+
if '-depth' in f or '-masklabel' in f:
|
1187 |
+
continue
|
1188 |
+
ext = os.path.splitext(f)[1].lower()
|
1189 |
+
if ext in ['.jpg', '.jpeg', '.png', '.bmp', '.webp']:
|
1190 |
+
try:
|
1191 |
+
img = Image.open(current)
|
1192 |
+
except:
|
1193 |
+
print(f" ** Skipping {current} because it failed to open, please check the file")
|
1194 |
+
progress_bar.update(1)
|
1195 |
+
continue
|
1196 |
+
del img
|
1197 |
+
if class_images == False:
|
1198 |
+
self.image_paths.append([current,concept_token])
|
1199 |
+
else:
|
1200 |
+
self.class_images_path.append([current,concept_token])
|
1201 |
+
progress_bar.update(1)
|
1202 |
+
if use_sub_dirs:
|
1203 |
+
sub_dirs = []
|
1204 |
+
|
1205 |
+
for d in os.listdir(data):
|
1206 |
+
current = os.path.join(data, d)
|
1207 |
+
if os.path.isdir(current):
|
1208 |
+
sub_dirs.append(current)
|
1209 |
+
|
1210 |
+
for dir in sub_dirs:
|
1211 |
+
if class_images == False:
|
1212 |
+
self.__recurse_data_root(self=self, recurse_root={'instance_data_dir' : dir, 'instance_prompt' : concept_token})
|
1213 |
+
else:
|
1214 |
+
self.__recurse_data_root(self=self, recurse_root={'class_data_dir' : dir, 'class_prompt' : concept_token})
|
1215 |
+
|
1216 |
+
def __len__(self):
|
1217 |
+
return self._length
|
1218 |
+
|
1219 |
+
def __getitem__(self, index):
|
1220 |
+
example = {}
|
1221 |
+
instance_path, instance_prompt = self.image_paths[index % self.num_instance_images]
|
1222 |
+
og_prompt = instance_prompt
|
1223 |
+
instance_image = Image.open(instance_path)
|
1224 |
+
if self.model_variant == "inpainting" or self.load_mask:
|
1225 |
+
|
1226 |
+
mask_pathname = os.path.splitext(instance_path)[0] + "-masklabel.png"
|
1227 |
+
if os.path.exists(mask_pathname) and self.load_mask:
|
1228 |
+
mask = Image.open(mask_pathname).convert("L")
|
1229 |
+
else:
|
1230 |
+
if self.variant_warning == False:
|
1231 |
+
print(f" {bcolors.FAIL} ** Warning: No mask found for an image, using an empty mask but make sure you're training the right model variant.{bcolors.ENDC}")
|
1232 |
+
self.variant_warning = True
|
1233 |
+
size = instance_image.size
|
1234 |
+
mask = Image.new('RGB', size, color="white").convert("L")
|
1235 |
+
example["mask"] = self.mask_transforms(mask)
|
1236 |
+
if self.model_variant == "depth2img":
|
1237 |
+
depth_pathname = os.path.splitext(instance_path)[0] + "-depth.png"
|
1238 |
+
if os.path.exists(depth_pathname):
|
1239 |
+
depth_image = Image.open(depth_pathname).convert("L")
|
1240 |
+
else:
|
1241 |
+
if self.variant_warning == False:
|
1242 |
+
print(f" {bcolors.FAIL} ** Warning: No depth image found for an image, using an empty depth image but make sure you're training the right model variant.{bcolors.ENDC}")
|
1243 |
+
self.variant_warning = True
|
1244 |
+
size = instance_image.size
|
1245 |
+
depth_image = Image.new('RGB', size, color="white").convert("L")
|
1246 |
+
example["instance_depth_images"] = self.depth_image_transforms(depth_image)
|
1247 |
+
|
1248 |
+
if self.use_image_names_as_captions == True:
|
1249 |
+
instance_prompt = str(instance_path).split(os.sep)[-1].split('.')[0].split('_')[0]
|
1250 |
+
#else if there's a txt file with the same name as the image, read the caption from there
|
1251 |
+
if self.use_text_files_as_captions == True:
|
1252 |
+
#if there's a file with the same name as the image, but with a .txt extension, read the caption from there
|
1253 |
+
#get the last . in the file name
|
1254 |
+
last_dot = str(instance_path).rfind('.')
|
1255 |
+
#get the path up to the last dot
|
1256 |
+
txt_path = str(instance_path)[:last_dot] + '.txt'
|
1257 |
+
|
1258 |
+
#if txt_path exists, read the caption from there
|
1259 |
+
if os.path.exists(txt_path):
|
1260 |
+
with open(txt_path, encoding='utf-8') as f:
|
1261 |
+
instance_prompt = f.readline().rstrip()
|
1262 |
+
f.close()
|
1263 |
+
|
1264 |
+
if self.shuffle_captions:
|
1265 |
+
caption_parts = instance_prompt.split(",")
|
1266 |
+
random.shuffle(caption_parts)
|
1267 |
+
instance_prompt = ",".join(caption_parts)
|
1268 |
+
|
1269 |
+
#print('identifier: ' + instance_prompt)
|
1270 |
+
instance_image = instance_image.convert("RGB")
|
1271 |
+
example["instance_images"] = self.image_transforms(instance_image)
|
1272 |
+
example["instance_prompt_ids"] = self.tokenizer(
|
1273 |
+
instance_prompt,
|
1274 |
+
padding="do_not_pad",
|
1275 |
+
truncation=True,
|
1276 |
+
max_length=self.tokenizer.model_max_length,
|
1277 |
+
).input_ids
|
1278 |
+
if self.with_prior_preservation:
|
1279 |
+
class_path, class_prompt = self.class_images_path[index % self.num_class_images]
|
1280 |
+
class_image = Image.open(class_path)
|
1281 |
+
if not class_image.mode == "RGB":
|
1282 |
+
class_image = class_image.convert("RGB")
|
1283 |
+
|
1284 |
+
if self.model_variant == "inpainting":
|
1285 |
+
mask_pathname = os.path.splitext(class_path)[0] + "-masklabel.png"
|
1286 |
+
if os.path.exists(mask_pathname):
|
1287 |
+
mask = Image.open(mask_pathname).convert("L")
|
1288 |
+
else:
|
1289 |
+
if self.variant_warning == False:
|
1290 |
+
print(f" {bcolors.FAIL} ** Warning: No mask found for an image, using an empty mask but make sure you're training the right model variant.{bcolors.ENDC}")
|
1291 |
+
self.variant_warning = True
|
1292 |
+
size = instance_image.size
|
1293 |
+
mask = Image.new('RGB', size, color="white").convert("L")
|
1294 |
+
example["class_mask"] = self.mask_transforms(mask)
|
1295 |
+
if self.model_variant == "depth2img":
|
1296 |
+
depth_pathname = os.path.splitext(class_path)[0] + "-depth.png"
|
1297 |
+
if os.path.exists(depth_pathname):
|
1298 |
+
depth_image = Image.open(depth_pathname)
|
1299 |
+
else:
|
1300 |
+
if self.variant_warning == False:
|
1301 |
+
print(f" {bcolors.FAIL} ** Warning: No depth image found for an image, using an empty depth image but make sure you're training the right model variant.{bcolors.ENDC}")
|
1302 |
+
self.variant_warning = True
|
1303 |
+
size = instance_image.size
|
1304 |
+
depth_image = Image.new('RGB', size, color="white").convert("L")
|
1305 |
+
example["class_depth_images"] = self.depth_image_transforms(depth_image)
|
1306 |
+
example["class_images"] = self.image_transforms(class_image)
|
1307 |
+
example["class_prompt_ids"] = self.tokenizer(
|
1308 |
+
class_prompt,
|
1309 |
+
padding="do_not_pad",
|
1310 |
+
truncation=True,
|
1311 |
+
max_length=self.tokenizer.model_max_length,
|
1312 |
+
).input_ids
|
1313 |
+
|
1314 |
+
return example
|
1315 |
+
|
1316 |
+
|
1317 |
+
class PromptDataset(Dataset):
|
1318 |
+
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
1319 |
+
|
1320 |
+
def __init__(self, prompt, num_samples):
|
1321 |
+
self.prompt = prompt
|
1322 |
+
self.num_samples = num_samples
|
1323 |
+
|
1324 |
+
def __len__(self):
|
1325 |
+
return self.num_samples
|
1326 |
+
|
1327 |
+
def __getitem__(self, index):
|
1328 |
+
example = {}
|
1329 |
+
example["prompt"] = self.prompt
|
1330 |
+
example["index"] = index
|
1331 |
+
return example
|
StableTuner_RunPod_Fix/discriminator.py
ADDED
@@ -0,0 +1,764 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
import torch.nn.functional as F
|
5 |
+
import einops, einops.layers.torch
|
6 |
+
import diffusers
|
7 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
8 |
+
from typing import Tuple, Optional
|
9 |
+
|
10 |
+
import inspect
|
11 |
+
import os
|
12 |
+
from functools import partial
|
13 |
+
from typing import Callable, List, Optional, Tuple, Union
|
14 |
+
|
15 |
+
import torch
|
16 |
+
from torch import Tensor, device
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
class ModelMixin(torch.nn.Module):
|
21 |
+
r"""
|
22 |
+
Base class for all models.
|
23 |
+
[`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading, downloading
|
24 |
+
and saving models.
|
25 |
+
- **config_name** ([`str`]) -- A filename under which the model should be stored when calling
|
26 |
+
[`~models.ModelMixin.save_pretrained`].
|
27 |
+
"""
|
28 |
+
config_name = "new"
|
29 |
+
_automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
|
30 |
+
_supports_gradient_checkpointing = False
|
31 |
+
|
32 |
+
def __init__(self):
|
33 |
+
super().__init__()
|
34 |
+
|
35 |
+
@property
|
36 |
+
def is_gradient_checkpointing(self) -> bool:
|
37 |
+
"""
|
38 |
+
Whether gradient checkpointing is activated for this model or not.
|
39 |
+
Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
|
40 |
+
activations".
|
41 |
+
"""
|
42 |
+
return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
|
43 |
+
|
44 |
+
def enable_gradient_checkpointing(self):
|
45 |
+
"""
|
46 |
+
Activates gradient checkpointing for the current model.
|
47 |
+
Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
|
48 |
+
activations".
|
49 |
+
"""
|
50 |
+
if not self._supports_gradient_checkpointing:
|
51 |
+
raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
|
52 |
+
self.apply(partial(self._set_gradient_checkpointing, value=True))
|
53 |
+
|
54 |
+
def disable_gradient_checkpointing(self):
|
55 |
+
"""
|
56 |
+
Deactivates gradient checkpointing for the current model.
|
57 |
+
Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
|
58 |
+
activations".
|
59 |
+
"""
|
60 |
+
if self._supports_gradient_checkpointing:
|
61 |
+
self.apply(partial(self._set_gradient_checkpointing, value=False))
|
62 |
+
|
63 |
+
def set_use_memory_efficient_attention_xformers(
|
64 |
+
self, valid: bool, attention_op: Optional[Callable] = None
|
65 |
+
) -> None:
|
66 |
+
# Recursively walk through all the children.
|
67 |
+
# Any children which exposes the set_use_memory_efficient_attention_xformers method
|
68 |
+
# gets the message
|
69 |
+
def fn_recursive_set_mem_eff(module: torch.nn.Module):
|
70 |
+
if hasattr(module, "set_use_memory_efficient_attention_xformers"):
|
71 |
+
module.set_use_memory_efficient_attention_xformers(valid, attention_op)
|
72 |
+
|
73 |
+
for child in module.children():
|
74 |
+
fn_recursive_set_mem_eff(child)
|
75 |
+
|
76 |
+
for module in self.children():
|
77 |
+
if isinstance(module, torch.nn.Module):
|
78 |
+
fn_recursive_set_mem_eff(module)
|
79 |
+
|
80 |
+
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
|
81 |
+
r"""
|
82 |
+
Enable memory efficient attention as implemented in xformers.
|
83 |
+
When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
|
84 |
+
time. Speed up at training time is not guaranteed.
|
85 |
+
Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
|
86 |
+
is used.
|
87 |
+
Parameters:
|
88 |
+
attention_op (`Callable`, *optional*):
|
89 |
+
Override the default `None` operator for use as `op` argument to the
|
90 |
+
[`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention)
|
91 |
+
function of xFormers.
|
92 |
+
Examples:
|
93 |
+
```py
|
94 |
+
>>> import torch
|
95 |
+
>>> from diffusers import UNet2DConditionModel
|
96 |
+
>>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
|
97 |
+
>>> model = UNet2DConditionModel.from_pretrained(
|
98 |
+
... "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16
|
99 |
+
... )
|
100 |
+
>>> model = model.to("cuda")
|
101 |
+
>>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
|
102 |
+
```
|
103 |
+
"""
|
104 |
+
self.set_use_memory_efficient_attention_xformers(True, attention_op)
|
105 |
+
|
106 |
+
def disable_xformers_memory_efficient_attention(self):
|
107 |
+
r"""
|
108 |
+
Disable memory efficient attention as implemented in xformers.
|
109 |
+
"""
|
110 |
+
self.set_use_memory_efficient_attention_xformers(False)
|
111 |
+
|
112 |
+
def save_pretrained(
|
113 |
+
self,
|
114 |
+
save_directory: Union[str, os.PathLike],
|
115 |
+
is_main_process: bool = True,
|
116 |
+
save_function: Callable = None,
|
117 |
+
safe_serialization: bool = False,
|
118 |
+
variant: Optional[str] = None,
|
119 |
+
):
|
120 |
+
"""
|
121 |
+
Save a model and its configuration file to a directory, so that it can be re-loaded using the
|
122 |
+
`[`~models.ModelMixin.from_pretrained`]` class method.
|
123 |
+
Arguments:
|
124 |
+
save_directory (`str` or `os.PathLike`):
|
125 |
+
Directory to which to save. Will be created if it doesn't exist.
|
126 |
+
is_main_process (`bool`, *optional*, defaults to `True`):
|
127 |
+
Whether the process calling this is the main process or not. Useful when in distributed training like
|
128 |
+
TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
|
129 |
+
the main process to avoid race conditions.
|
130 |
+
save_function (`Callable`):
|
131 |
+
The function to use to save the state dictionary. Useful on distributed training like TPUs when one
|
132 |
+
need to replace `torch.save` by another method. Can be configured with the environment variable
|
133 |
+
`DIFFUSERS_SAVE_MODE`.
|
134 |
+
safe_serialization (`bool`, *optional*, defaults to `False`):
|
135 |
+
Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
|
136 |
+
variant (`str`, *optional*):
|
137 |
+
If specified, weights are saved in the format pytorch_model.<variant>.bin.
|
138 |
+
"""
|
139 |
+
if safe_serialization and not is_safetensors_available():
|
140 |
+
raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.")
|
141 |
+
|
142 |
+
if os.path.isfile(save_directory):
|
143 |
+
logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
|
144 |
+
return
|
145 |
+
|
146 |
+
os.makedirs(save_directory, exist_ok=True)
|
147 |
+
|
148 |
+
model_to_save = self
|
149 |
+
|
150 |
+
# Attach architecture to the config
|
151 |
+
# Save the config
|
152 |
+
if is_main_process:
|
153 |
+
model_to_save.save_config(save_directory)
|
154 |
+
|
155 |
+
# Save the model
|
156 |
+
state_dict = model_to_save.state_dict()
|
157 |
+
|
158 |
+
weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
|
159 |
+
weights_name = _add_variant(weights_name, variant)
|
160 |
+
|
161 |
+
# Save the model
|
162 |
+
if safe_serialization:
|
163 |
+
safetensors.torch.save_file(
|
164 |
+
state_dict, os.path.join(save_directory, weights_name), metadata={"format": "pt"}
|
165 |
+
)
|
166 |
+
else:
|
167 |
+
torch.save(state_dict, os.path.join(save_directory, weights_name))
|
168 |
+
|
169 |
+
logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}")
|
170 |
+
|
171 |
+
@classmethod
|
172 |
+
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
|
173 |
+
r"""
|
174 |
+
Instantiate a pretrained pytorch model from a pre-trained model configuration.
|
175 |
+
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
|
176 |
+
the model, you should first set it back in training mode with `model.train()`.
|
177 |
+
The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
|
178 |
+
pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
|
179 |
+
task.
|
180 |
+
The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
|
181 |
+
weights are discarded.
|
182 |
+
Parameters:
|
183 |
+
pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
|
184 |
+
Can be either:
|
185 |
+
- A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
|
186 |
+
Valid model ids should have an organization name, like `google/ddpm-celebahq-256`.
|
187 |
+
- A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g.,
|
188 |
+
`./my_model_directory/`.
|
189 |
+
cache_dir (`Union[str, os.PathLike]`, *optional*):
|
190 |
+
Path to a directory in which a downloaded pretrained model configuration should be cached if the
|
191 |
+
standard cache should not be used.
|
192 |
+
torch_dtype (`str` or `torch.dtype`, *optional*):
|
193 |
+
Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
|
194 |
+
will be automatically derived from the model's weights.
|
195 |
+
force_download (`bool`, *optional*, defaults to `False`):
|
196 |
+
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
|
197 |
+
cached versions if they exist.
|
198 |
+
resume_download (`bool`, *optional*, defaults to `False`):
|
199 |
+
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
|
200 |
+
file exists.
|
201 |
+
proxies (`Dict[str, str]`, *optional*):
|
202 |
+
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
203 |
+
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
|
204 |
+
output_loading_info(`bool`, *optional*, defaults to `False`):
|
205 |
+
Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
|
206 |
+
local_files_only(`bool`, *optional*, defaults to `False`):
|
207 |
+
Whether or not to only look at local files (i.e., do not try to download the model).
|
208 |
+
use_auth_token (`str` or *bool*, *optional*):
|
209 |
+
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
|
210 |
+
when running `diffusers-cli login` (stored in `~/.huggingface`).
|
211 |
+
revision (`str`, *optional*, defaults to `"main"`):
|
212 |
+
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
|
213 |
+
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
|
214 |
+
identifier allowed by git.
|
215 |
+
from_flax (`bool`, *optional*, defaults to `False`):
|
216 |
+
Load the model weights from a Flax checkpoint save file.
|
217 |
+
subfolder (`str`, *optional*, defaults to `""`):
|
218 |
+
In case the relevant files are located inside a subfolder of the model repo (either remote in
|
219 |
+
huggingface.co or downloaded locally), you can specify the folder name here.
|
220 |
+
mirror (`str`, *optional*):
|
221 |
+
Mirror source to accelerate downloads in China. If you are from China and have an accessibility
|
222 |
+
problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
|
223 |
+
Please refer to the mirror site for more information.
|
224 |
+
device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
|
225 |
+
A map that specifies where each submodule should go. It doesn't need to be refined to each
|
226 |
+
parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
|
227 |
+
same device.
|
228 |
+
To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
|
229 |
+
more information about each option see [designing a device
|
230 |
+
map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
|
231 |
+
low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
|
232 |
+
Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
|
233 |
+
also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
|
234 |
+
model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
|
235 |
+
setting this argument to `True` will raise an error.
|
236 |
+
variant (`str`, *optional*):
|
237 |
+
If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin. `variant` is
|
238 |
+
ignored when using `from_flax`.
|
239 |
+
use_safetensors (`bool`, *optional* ):
|
240 |
+
If set to `True`, the pipeline will forcibly load the models from `safetensors` weights. If set to
|
241 |
+
`None` (the default). The pipeline will load using `safetensors` if safetensors weights are available
|
242 |
+
*and* if `safetensors` is installed. If the to `False` the pipeline will *not* use `safetensors`.
|
243 |
+
<Tip>
|
244 |
+
It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
|
245 |
+
models](https://huggingface.co/docs/hub/models-gated#gated-models).
|
246 |
+
</Tip>
|
247 |
+
<Tip>
|
248 |
+
Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
|
249 |
+
this method in a firewalled environment.
|
250 |
+
</Tip>
|
251 |
+
"""
|
252 |
+
cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
|
253 |
+
ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
|
254 |
+
force_download = kwargs.pop("force_download", False)
|
255 |
+
from_flax = kwargs.pop("from_flax", False)
|
256 |
+
resume_download = kwargs.pop("resume_download", False)
|
257 |
+
proxies = kwargs.pop("proxies", None)
|
258 |
+
output_loading_info = kwargs.pop("output_loading_info", False)
|
259 |
+
local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
|
260 |
+
use_auth_token = kwargs.pop("use_auth_token", None)
|
261 |
+
revision = kwargs.pop("revision", None)
|
262 |
+
torch_dtype = kwargs.pop("torch_dtype", None)
|
263 |
+
subfolder = kwargs.pop("subfolder", None)
|
264 |
+
device_map = kwargs.pop("device_map", None)
|
265 |
+
low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
|
266 |
+
variant = kwargs.pop("variant", None)
|
267 |
+
use_safetensors = kwargs.pop("use_safetensors", None)
|
268 |
+
|
269 |
+
if use_safetensors and not is_safetensors_available():
|
270 |
+
raise ValueError(
|
271 |
+
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
|
272 |
+
)
|
273 |
+
|
274 |
+
allow_pickle = False
|
275 |
+
if use_safetensors is None:
|
276 |
+
use_safetensors = is_safetensors_available()
|
277 |
+
allow_pickle = True
|
278 |
+
|
279 |
+
if low_cpu_mem_usage and not is_accelerate_available():
|
280 |
+
low_cpu_mem_usage = False
|
281 |
+
logger.warning(
|
282 |
+
"Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
|
283 |
+
" environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
|
284 |
+
" `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
|
285 |
+
" install accelerate\n```\n."
|
286 |
+
)
|
287 |
+
|
288 |
+
if device_map is not None and not is_accelerate_available():
|
289 |
+
raise NotImplementedError(
|
290 |
+
"Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
|
291 |
+
" `device_map=None`. You can install accelerate with `pip install accelerate`."
|
292 |
+
)
|
293 |
+
|
294 |
+
# Check if we can handle device_map and dispatching the weights
|
295 |
+
if device_map is not None and not is_torch_version(">=", "1.9.0"):
|
296 |
+
raise NotImplementedError(
|
297 |
+
"Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
|
298 |
+
" `device_map=None`."
|
299 |
+
)
|
300 |
+
|
301 |
+
if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
|
302 |
+
raise NotImplementedError(
|
303 |
+
"Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
|
304 |
+
" `low_cpu_mem_usage=False`."
|
305 |
+
)
|
306 |
+
|
307 |
+
if low_cpu_mem_usage is False and device_map is not None:
|
308 |
+
raise ValueError(
|
309 |
+
f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
|
310 |
+
" dispatching. Please make sure to set `low_cpu_mem_usage=True`."
|
311 |
+
)
|
312 |
+
|
313 |
+
# Load config if we don't provide a configuration
|
314 |
+
config_path = pretrained_model_name_or_path
|
315 |
+
|
316 |
+
user_agent = {
|
317 |
+
"diffusers": __version__,
|
318 |
+
"file_type": "model",
|
319 |
+
"framework": "pytorch",
|
320 |
+
}
|
321 |
+
|
322 |
+
# load config
|
323 |
+
config, unused_kwargs, commit_hash = cls.load_config(
|
324 |
+
config_path,
|
325 |
+
cache_dir=cache_dir,
|
326 |
+
return_unused_kwargs=True,
|
327 |
+
return_commit_hash=True,
|
328 |
+
force_download=force_download,
|
329 |
+
resume_download=resume_download,
|
330 |
+
proxies=proxies,
|
331 |
+
local_files_only=local_files_only,
|
332 |
+
use_auth_token=use_auth_token,
|
333 |
+
revision=revision,
|
334 |
+
subfolder=subfolder,
|
335 |
+
device_map=device_map,
|
336 |
+
user_agent=user_agent,
|
337 |
+
**kwargs,
|
338 |
+
)
|
339 |
+
|
340 |
+
# load model
|
341 |
+
model_file = None
|
342 |
+
if from_flax:
|
343 |
+
model_file = _get_model_file(
|
344 |
+
pretrained_model_name_or_path,
|
345 |
+
weights_name=FLAX_WEIGHTS_NAME,
|
346 |
+
cache_dir=cache_dir,
|
347 |
+
force_download=force_download,
|
348 |
+
resume_download=resume_download,
|
349 |
+
proxies=proxies,
|
350 |
+
local_files_only=local_files_only,
|
351 |
+
use_auth_token=use_auth_token,
|
352 |
+
revision=revision,
|
353 |
+
subfolder=subfolder,
|
354 |
+
user_agent=user_agent,
|
355 |
+
commit_hash=commit_hash,
|
356 |
+
)
|
357 |
+
model = cls.from_config(config, **unused_kwargs)
|
358 |
+
|
359 |
+
# Convert the weights
|
360 |
+
from .modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model
|
361 |
+
|
362 |
+
model = load_flax_checkpoint_in_pytorch_model(model, model_file)
|
363 |
+
else:
|
364 |
+
if use_safetensors:
|
365 |
+
try:
|
366 |
+
model_file = _get_model_file(
|
367 |
+
pretrained_model_name_or_path,
|
368 |
+
weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
|
369 |
+
cache_dir=cache_dir,
|
370 |
+
force_download=force_download,
|
371 |
+
resume_download=resume_download,
|
372 |
+
proxies=proxies,
|
373 |
+
local_files_only=local_files_only,
|
374 |
+
use_auth_token=use_auth_token,
|
375 |
+
revision=revision,
|
376 |
+
subfolder=subfolder,
|
377 |
+
user_agent=user_agent,
|
378 |
+
commit_hash=commit_hash,
|
379 |
+
)
|
380 |
+
except IOError as e:
|
381 |
+
if not allow_pickle:
|
382 |
+
raise e
|
383 |
+
pass
|
384 |
+
if model_file is None:
|
385 |
+
model_file = _get_model_file(
|
386 |
+
pretrained_model_name_or_path,
|
387 |
+
weights_name=_add_variant(WEIGHTS_NAME, variant),
|
388 |
+
cache_dir=cache_dir,
|
389 |
+
force_download=force_download,
|
390 |
+
resume_download=resume_download,
|
391 |
+
proxies=proxies,
|
392 |
+
local_files_only=local_files_only,
|
393 |
+
use_auth_token=use_auth_token,
|
394 |
+
revision=revision,
|
395 |
+
subfolder=subfolder,
|
396 |
+
user_agent=user_agent,
|
397 |
+
commit_hash=commit_hash,
|
398 |
+
)
|
399 |
+
|
400 |
+
if low_cpu_mem_usage:
|
401 |
+
# Instantiate model with empty weights
|
402 |
+
with accelerate.init_empty_weights():
|
403 |
+
model = cls.from_config(config, **unused_kwargs)
|
404 |
+
|
405 |
+
# if device_map is None, load the state dict and move the params from meta device to the cpu
|
406 |
+
if device_map is None:
|
407 |
+
param_device = "cpu"
|
408 |
+
state_dict = load_state_dict(model_file, variant=variant)
|
409 |
+
# move the params from meta device to cpu
|
410 |
+
missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
|
411 |
+
if len(missing_keys) > 0:
|
412 |
+
raise ValueError(
|
413 |
+
f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are"
|
414 |
+
f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
|
415 |
+
" `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
|
416 |
+
" those weights or else make sure your checkpoint file is correct."
|
417 |
+
)
|
418 |
+
|
419 |
+
empty_state_dict = model.state_dict()
|
420 |
+
for param_name, param in state_dict.items():
|
421 |
+
accepts_dtype = "dtype" in set(
|
422 |
+
inspect.signature(set_module_tensor_to_device).parameters.keys()
|
423 |
+
)
|
424 |
+
|
425 |
+
if empty_state_dict[param_name].shape != param.shape:
|
426 |
+
raise ValueError(
|
427 |
+
f"Cannot load {pretrained_model_name_or_path} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
|
428 |
+
)
|
429 |
+
|
430 |
+
if accepts_dtype:
|
431 |
+
set_module_tensor_to_device(
|
432 |
+
model, param_name, param_device, value=param, dtype=torch_dtype
|
433 |
+
)
|
434 |
+
else:
|
435 |
+
set_module_tensor_to_device(model, param_name, param_device, value=param)
|
436 |
+
else: # else let accelerate handle loading and dispatching.
|
437 |
+
# Load weights and dispatch according to the device_map
|
438 |
+
# by default the device_map is None and the weights are loaded on the CPU
|
439 |
+
accelerate.load_checkpoint_and_dispatch(model, model_file, device_map, dtype=torch_dtype)
|
440 |
+
|
441 |
+
loading_info = {
|
442 |
+
"missing_keys": [],
|
443 |
+
"unexpected_keys": [],
|
444 |
+
"mismatched_keys": [],
|
445 |
+
"error_msgs": [],
|
446 |
+
}
|
447 |
+
else:
|
448 |
+
model = cls.from_config(config, **unused_kwargs)
|
449 |
+
|
450 |
+
state_dict = load_state_dict(model_file, variant=variant)
|
451 |
+
|
452 |
+
model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
|
453 |
+
model,
|
454 |
+
state_dict,
|
455 |
+
model_file,
|
456 |
+
pretrained_model_name_or_path,
|
457 |
+
ignore_mismatched_sizes=ignore_mismatched_sizes,
|
458 |
+
)
|
459 |
+
|
460 |
+
loading_info = {
|
461 |
+
"missing_keys": missing_keys,
|
462 |
+
"unexpected_keys": unexpected_keys,
|
463 |
+
"mismatched_keys": mismatched_keys,
|
464 |
+
"error_msgs": error_msgs,
|
465 |
+
}
|
466 |
+
|
467 |
+
if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
|
468 |
+
raise ValueError(
|
469 |
+
f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
|
470 |
+
)
|
471 |
+
elif torch_dtype is not None:
|
472 |
+
model = model.to(torch_dtype)
|
473 |
+
|
474 |
+
model.register_to_config(_name_or_path=pretrained_model_name_or_path)
|
475 |
+
|
476 |
+
# Set model in evaluation mode to deactivate DropOut modules by default
|
477 |
+
model.eval()
|
478 |
+
        if output_loading_info:
            return model, loading_info

        return model

    @classmethod
    def _load_pretrained_model(
        cls,
        model,
        state_dict,
        resolved_archive_file,
        pretrained_model_name_or_path,
        ignore_mismatched_sizes=False,
    ):
        # Retrieve missing & unexpected_keys
        model_state_dict = model.state_dict()
        loaded_keys = list(state_dict.keys())

        expected_keys = list(model_state_dict.keys())

        original_loaded_keys = loaded_keys

        missing_keys = list(set(expected_keys) - set(loaded_keys))
        unexpected_keys = list(set(loaded_keys) - set(expected_keys))

        # Make sure we are able to load base models as well as derived models (with heads)
        model_to_load = model

        def _find_mismatched_keys(
            state_dict,
            model_state_dict,
            loaded_keys,
            ignore_mismatched_sizes,
        ):
            mismatched_keys = []
            if ignore_mismatched_sizes:
                for checkpoint_key in loaded_keys:
                    model_key = checkpoint_key

                    if (
                        model_key in model_state_dict
                        and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
                    ):
                        mismatched_keys.append(
                            (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
                        )
                        del state_dict[checkpoint_key]
            return mismatched_keys

        if state_dict is not None:
            # Whole checkpoint
            mismatched_keys = _find_mismatched_keys(
                state_dict,
                model_state_dict,
                original_loaded_keys,
                ignore_mismatched_sizes,
            )
            error_msgs = _load_state_dict_into_model(model_to_load, state_dict)

        if len(error_msgs) > 0:
            error_msg = "\n\t".join(error_msgs)
            if "size mismatch" in error_msg:
                error_msg += (
                    "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
                )
            raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")

        if len(unexpected_keys) > 0:
            logger.warning(
                f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
                f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
                f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
                " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
                " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
                f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
                " identical (initializing a BertForSequenceClassification model from a"
                " BertForSequenceClassification model)."
            )
        else:
            logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
        if len(missing_keys) > 0:
            logger.warning(
                f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
                f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
                " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
            )
        elif len(mismatched_keys) == 0:
            logger.info(
                f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
                f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
                f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
                " without further training."
            )
        if len(mismatched_keys) > 0:
            mismatched_warning = "\n".join(
                [
                    f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
                    for key, shape1, shape2 in mismatched_keys
                ]
            )
            logger.warning(
                f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
                f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
                f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
                " able to use it for predictions and inference."
            )

        return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs

    @property
    def device(self) -> device:
        """
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        """
        return get_parameter_device(self)

    @property
    def dtype(self) -> torch.dtype:
        """
        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
        """
        return get_parameter_dtype(self)

    def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
        """
        Get number of (optionally, trainable or non-embeddings) parameters in the module.
        Args:
            only_trainable (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of trainable parameters
            exclude_embeddings (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of non-embeddings parameters
        Returns:
            `int`: The number of parameters.
        """

        if exclude_embeddings:
            embedding_param_names = [
                f"{name}.weight"
                for name, module_type in self.named_modules()
                if isinstance(module_type, torch.nn.Embedding)
            ]
            non_embedding_parameters = [
                parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
            ]
            return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
        else:
            return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)


def Downsample(dim, dim_out):
    return nn.Conv2d(dim, dim_out, 4, 2, 1)


class Residual(nn.Sequential):
    def forward(self, input):
        x = input
        for module in self:
            x = module(x)
        return x + input


def ConvLayer(dim, dim_out, *, kernel_size=3, groups=32):
    return nn.Sequential(
        nn.GroupNorm(groups, dim),
        nn.SiLU(),
        nn.Conv2d(dim, dim_out, kernel_size=kernel_size, padding=kernel_size // 2),
    )


def ResnetBlock(dim, *, kernel_size=3, groups=32):
    return Residual(
        ConvLayer(dim, dim, kernel_size=kernel_size, groups=groups),
        ConvLayer(dim, dim, kernel_size=kernel_size, groups=groups),
    )


class SelfAttention(nn.Module):
    def __init__(self, dim, out_dim, *, heads=8, key_dim=32, value_dim=32):
        super().__init__()
        self.dim = dim
        self.out_dim = dim
        self.heads = heads
        self.key_dim = key_dim

        self.to_k = nn.Linear(dim, key_dim)
        self.to_v = nn.Linear(dim, value_dim)
        self.to_q = nn.Linear(dim, key_dim * heads)
        self.to_out = nn.Linear(value_dim * heads, out_dim)

    def forward(self, x):
        shape = x.shape
        x = einops.rearrange(x, 'b c ... -> b (...) c')

        k = self.to_k(x)
        v = self.to_v(x)
        q = self.to_q(x)
        q = einops.rearrange(q, 'b n (h c) -> b (n h) c', h=self.heads)
        if hasattr(nn.functional, "scaled_dot_product_attention"):
            result = F.scaled_dot_product_attention(q, k, v)
        else:
            attention_scores = torch.bmm(q, k.transpose(-2, -1))
            attention_probs = torch.softmax(attention_scores.float() / math.sqrt(self.key_dim), dim=-1).type(attention_scores.dtype)
            result = torch.bmm(attention_probs, v)
        result = einops.rearrange(result, 'b (n h) c -> b n (h c)', h=self.heads)
        out = self.to_out(result)

        out = einops.rearrange(out, 'b n c -> b c n')
        out = torch.reshape(out, (shape[0], self.out_dim, *shape[2:]))
        return out


def SelfAttentionBlock(dim, attention_dim, *, heads=8, groups=32):
    if not attention_dim:
        attention_dim = dim // heads
    return Residual(
        nn.GroupNorm(groups, dim),
        SelfAttention(dim, dim, heads=heads, key_dim=attention_dim, value_dim=attention_dim),
    )


class Discriminator2D(ModelMixin, ConfigMixin):
    @register_to_config
    def __init__(
        self,
        in_channels: int = 8,
        out_channels: int = 1,
        block_out_channels: Tuple[int] = (128, 256, 512, 1024, 1024, 1024),
        block_repeats: Tuple[int] = (2, 2, 2, 2, 2),
        downsample_blocks: Tuple[int] = (0, 1, 2),
        attention_blocks: Tuple[int] = (1, 2, 3, 4),
        mlp_hidden_channels: Tuple[int] = (2048, 2048, 2048),
        mlp_uses_norm: bool = True,
        attention_dim: Optional[int] = None,
        attention_heads: int = 8,
        groups: int = 32,
        embedding_dim: int = 768,
    ):
        super().__init__()

        self.blocks = nn.ModuleList([])

        self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], 7, padding=3)

        for i in range(0, len(block_out_channels) - 1):
            block_in = block_out_channels[i]
            block_out = block_out_channels[i + 1]
            block = nn.Sequential()
            for j in range(0, block_repeats[i]):
                if i in attention_blocks:
                    block.append(SelfAttentionBlock(block_in, attention_dim, heads=attention_heads, groups=groups))
                block.append(ResnetBlock(block_in, groups=groups))
            if i in downsample_blocks:
                block.append(Downsample(block_in, block_out))
            elif block_in != block_out:
                block.append(nn.Conv2d(block_in, block_out, 1))
            self.blocks.append(block)

        # A simple MLP to make the final decision based on statistics from
        # the output of every block
        self.to_out = nn.Sequential()
        d_channels = 2 * sum(block_out_channels[1:]) + embedding_dim
        for c in mlp_hidden_channels:
            self.to_out.append(nn.Linear(d_channels, c))
            if mlp_uses_norm:
                self.to_out.append(nn.GroupNorm(groups, c))
            self.to_out.append(nn.SiLU())
            d_channels = c
        self.to_out.append(nn.Linear(d_channels, out_channels))

        self.gradient_checkpointing = False

    def enable_gradient_checkpointing(self):
        self.gradient_checkpointing = True

    def disable_gradient_checkpointing(self):
        self.gradient_checkpointing = False

    def forward(self, x, encoder_hidden_states):
        x = self.conv_in(x)
        if self.config.embedding_dim != 0:
            d = einops.reduce(encoder_hidden_states, 'b n c -> b c', 'mean')
        else:
            d = torch.zeros([x.shape[0], 0], device=x.device, dtype=x.dtype)
        for block in self.blocks:
            if self.gradient_checkpointing:
                x = torch.utils.checkpoint.checkpoint(block, x)
            else:
                x = block(x)
            x_mean = einops.reduce(x, 'b c ... -> b c', 'mean')
            x_max = einops.reduce(x, 'b c ... -> b c', 'max')
            d = torch.cat([d, x_mean, x_max], dim=-1)
        return self.to_out(d)
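
For reference, a minimal usage sketch of the discriminator above (not part of the upload). The module path and tensor shapes are assumptions for illustration: the defaults expect 8 input channels, and the text embeddings are CLIP encoder hidden states that forward() averages over the token axis.

import torch
from discriminator import Discriminator2D  # assumed module name for the file above

disc = Discriminator2D()                    # default config: in_channels=8, embedding_dim=768
latents = torch.randn(2, 8, 64, 64)         # dummy latent-space input
text_embeddings = torch.randn(2, 77, 768)   # dummy CLIP hidden states
logits = disc(latents, text_embeddings)     # -> shape (2, 1)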
StableTuner_RunPod_Fix/lion_pytorch.py
ADDED
@@ -0,0 +1,88 @@
from typing import Tuple, Optional, Callable

import torch
from torch.optim.optimizer import Optimizer

# functions

def exists(val):
    return val is not None

# update functions

def update_fn(p, grad, exp_avg, lr, wd, beta1, beta2):
    # stepweight decay

    p.data.mul_(1 - lr * wd)

    # weight update

    update = exp_avg.clone().mul_(beta1).add(grad, alpha = 1 - beta1).sign_()
    p.add_(update, alpha = -lr)

    # decay the momentum running average coefficient

    exp_avg.mul_(beta2).add_(grad, alpha = 1 - beta2)

# class

class Lion(Optimizer):
    def __init__(
        self,
        params,
        lr: float = 1e-4,
        betas: Tuple[float, float] = (0.9, 0.99),
        weight_decay: float = 0.0,
        use_triton: bool = False
    ):
        assert lr > 0.
        assert all([0. <= beta <= 1. for beta in betas])

        defaults = dict(
            lr = lr,
            betas = betas,
            weight_decay = weight_decay
        )

        super().__init__(params, defaults)

        self.update_fn = update_fn

        if use_triton:
            from lion_pytorch.triton import update_fn as triton_update_fn
            self.update_fn = triton_update_fn

    @torch.no_grad()
    def step(
        self,
        closure: Optional[Callable] = None
    ):

        loss = None
        if exists(closure):
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in filter(lambda p: exists(p.grad), group['params']):

                grad, lr, wd, beta1, beta2, state = p.grad, group['lr'], group['weight_decay'], *group['betas'], self.state[p]

                # init state - exponential moving average of gradient values

                if len(state) == 0:
                    state['exp_avg'] = torch.zeros_like(p)

                exp_avg = state['exp_avg']

                self.update_fn(
                    p,
                    grad,
                    exp_avg,
                    lr,
                    wd,
                    beta1,
                    beta2
                )

        return loss
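
A minimal usage sketch, assuming the file above is importable as lion_pytorch. Lion is a drop-in replacement for AdamW; in practice it is usually run with a smaller learning rate and a larger weight decay than AdamW.

import torch
from lion_pytorch import Lion  # assumed import path for the file above

model = torch.nn.Linear(10, 2)
optimizer = Lion(model.parameters(), lr=1e-4, betas=(0.9, 0.99), weight_decay=1e-2)

x, y = torch.randn(8, 10), torch.randn(8, 2)
loss = torch.nn.functional.mse_loss(model(x), y)
loss.backward()
optimizer.step()
optimizer.zero_grad()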
StableTuner_RunPod_Fix/lora_utils.py
ADDED
@@ -0,0 +1,236 @@
# LoRA network module
# reference:
# https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
# https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py

import math
import os
import torch

from trainer_util import *


class LoRAModule(torch.nn.Module):
    """
    replaces forward method of the original Linear, instead of replacing the original Linear module.
    """

    def __init__(self, lora_name, org_module: torch.nn.Module, multiplier=1.0, lora_dim=4, alpha=1):
        """ if alpha == 0 or None, alpha is rank (no scaling). """
        super().__init__()
        self.lora_name = lora_name
        self.lora_dim = lora_dim

        if org_module.__class__.__name__ == 'Conv2d':
            in_dim = org_module.in_channels
            out_dim = org_module.out_channels
            self.lora_down = torch.nn.Conv2d(in_dim, lora_dim, (1, 1), bias=False)
            self.lora_up = torch.nn.Conv2d(lora_dim, out_dim, (1, 1), bias=False)
        else:
            in_dim = org_module.in_features
            out_dim = org_module.out_features
            self.lora_down = torch.nn.Linear(in_dim, lora_dim, bias=False)
            self.lora_up = torch.nn.Linear(lora_dim, out_dim, bias=False)

        if type(alpha) == torch.Tensor:
            alpha = alpha.detach().float().numpy()  # without casting, bf16 causes error
        alpha = lora_dim if alpha is None or alpha == 0 else alpha
        self.scale = alpha / self.lora_dim
        self.register_buffer('alpha', torch.tensor(alpha))  # stored so it can be treated as a constant

        # same as microsoft's
        torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5))
        torch.nn.init.zeros_(self.lora_up.weight)

        self.multiplier = multiplier
        self.org_module = org_module  # remove in applying

    def apply_to(self):
        self.org_forward = self.org_module.forward
        self.org_module.forward = self.forward
        del self.org_module

    def forward(self, x):
        return self.org_forward(x) + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale


def create_network(multiplier, network_dim, network_alpha, vae, text_encoder, unet, **kwargs):
    if network_dim is None:
        network_dim = 4  # default
    network = LoRANetwork(text_encoder, unet, multiplier=multiplier, lora_dim=network_dim, alpha=network_alpha)
    return network


def create_network_from_weights(multiplier, file, vae, text_encoder, unet, **kwargs):
    if os.path.splitext(file)[1] == '.safetensors':
        from safetensors.torch import load_file, safe_open
        weights_sd = load_file(file)
    else:
        weights_sd = torch.load(file, map_location='cpu')

    # get dim (rank)
    network_alpha = None
    network_dim = None
    for key, value in weights_sd.items():
        if network_alpha is None and 'alpha' in key:
            network_alpha = value
        if network_dim is None and 'lora_down' in key and len(value.size()) == 2:
            network_dim = value.size()[0]

    if network_alpha is None:
        network_alpha = network_dim

    network = LoRANetwork(text_encoder, unet, multiplier=multiplier, lora_dim=network_dim, alpha=network_alpha)
    network.weights_sd = weights_sd
    return network


class LoRANetwork(torch.nn.Module):
    UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel", "Attention"]
    TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"]
    LORA_PREFIX_UNET = 'lora_unet'
    LORA_PREFIX_TEXT_ENCODER = 'lora_te'

    def __init__(self, text_encoder, unet, multiplier=1.0, lora_dim=4, alpha=1) -> None:
        super().__init__()
        self.multiplier = multiplier
        self.lora_dim = lora_dim
        self.alpha = alpha

        # create module instances
        def create_modules(prefix, root_module: torch.nn.Module, target_replace_modules) -> list[LoRAModule]:
            loras = []
            for name, module in root_module.named_modules():
                if module.__class__.__name__ in target_replace_modules:
                    for child_name, child_module in module.named_modules():
                        if child_module.__class__.__name__ == "Linear" or (child_module.__class__.__name__ == "Conv2d" and child_module.kernel_size == (1, 1)):
                            lora_name = prefix + '.' + name + '.' + child_name
                            lora_name = lora_name.replace('.', '_')
                            lora = LoRAModule(lora_name, child_module, self.multiplier, self.lora_dim, self.alpha)
                            loras.append(lora)
            return loras

        self.text_encoder_loras = create_modules(LoRANetwork.LORA_PREFIX_TEXT_ENCODER,
                                                 text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE)
        print(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.")

        self.unet_loras = create_modules(LoRANetwork.LORA_PREFIX_UNET, unet, LoRANetwork.UNET_TARGET_REPLACE_MODULE)
        print(f"create LoRA for U-Net: {len(self.unet_loras)} modules.")

        self.weights_sd = None

        # assertion
        names = set()
        for lora in self.text_encoder_loras + self.unet_loras:
            assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}"
            names.add(lora.lora_name)

    def load_weights(self, file):
        if os.path.splitext(file)[1] == '.safetensors':
            from safetensors.torch import load_file, safe_open
            self.weights_sd = load_file(file)
        else:
            self.weights_sd = torch.load(file, map_location='cpu')

    def apply_to(self, text_encoder, unet, apply_text_encoder=None, apply_unet=None):
        if self.weights_sd:
            weights_has_text_encoder = weights_has_unet = False
            for key in self.weights_sd.keys():
                if key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER):
                    weights_has_text_encoder = True
                elif key.startswith(LoRANetwork.LORA_PREFIX_UNET):
                    weights_has_unet = True

            if apply_text_encoder is None:
                apply_text_encoder = weights_has_text_encoder
            else:
                assert apply_text_encoder == weights_has_text_encoder, f"text encoder weights: {weights_has_text_encoder} but text encoder flag: {apply_text_encoder} / the loaded weights and the text encoder flag are inconsistent"

            if apply_unet is None:
                apply_unet = weights_has_unet
            else:
                assert apply_unet == weights_has_unet, f"u-net weights: {weights_has_unet} but u-net flag: {apply_unet} / the loaded weights and the U-Net flag are inconsistent"
        else:
            assert apply_text_encoder is not None and apply_unet is not None, f"internal error: flag not set"

        if apply_text_encoder:
            print("enable LoRA for text encoder")
        else:
            self.text_encoder_loras = []

        if apply_unet:
            print("enable LoRA for U-Net")
        else:
            self.unet_loras = []

        for lora in self.text_encoder_loras + self.unet_loras:
            lora.apply_to()
            self.add_module(lora.lora_name, lora)

        if self.weights_sd:
            # if some weights are not in state dict, it is ok because initial LoRA does nothing (lora_up is initialized by zeros)
            info = self.load_state_dict(self.weights_sd, False)
            print(f"weights are loaded: {info}")

    def enable_gradient_checkpointing(self):
        # not supported
        pass

    def prepare_optimizer_params(self, text_encoder_lr, unet_lr):
        def enumerate_params(loras):
            params = []
            for lora in loras:
                params.extend(lora.parameters())
            return params

        self.requires_grad_(True)
        all_params = []

        if self.text_encoder_loras:
            param_data = {'params': enumerate_params(self.text_encoder_loras)}
            if text_encoder_lr is not None:
                param_data['lr'] = text_encoder_lr
            all_params.append(param_data)

        if self.unet_loras:
            param_data = {'params': enumerate_params(self.unet_loras)}
            if unet_lr is not None:
                param_data['lr'] = unet_lr
            all_params.append(param_data)

        return all_params

    def prepare_grad_etc(self, text_encoder, unet):
        self.requires_grad_(True)

    def on_epoch_start(self, text_encoder, unet):
        self.train()

    def get_trainable_params(self):
        return self.parameters()

    def save_weights(self, file, dtype, metadata):
        if metadata is not None and len(metadata) == 0:
            metadata = None

        state_dict = self.state_dict()

        if dtype is not None:
            for key in list(state_dict.keys()):
                v = state_dict[key]
                v = v.detach().clone().to("cpu").to(dtype)
                state_dict[key] = v

        if os.path.splitext(file)[1] == '.safetensors':
            from safetensors.torch import save_file

            # Precalculate model hashes to save time on indexing
            if metadata is None:
                metadata = {}
            model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata)
            metadata["sshs_model_hash"] = model_hash
            metadata["sshs_legacy_hash"] = legacy_hash

            save_file(state_dict, file, metadata)
        else:
            torch.save(state_dict, file)
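
A minimal usage sketch, assuming the text encoder and U-Net are loaded from the v1 reference model named in model_util.py below; which modules get wrapped depends on the installed diffusers/transformers class names matching UNET_TARGET_REPLACE_MODULE and TEXT_ENCODER_TARGET_REPLACE_MODULE.

import torch
from transformers import CLIPTextModel
from diffusers import UNet2DConditionModel
from lora_utils import create_network  # assumed import path for the file above

model_id = 'runwayml/stable-diffusion-v1-5'
text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder='text_encoder')
unet = UNet2DConditionModel.from_pretrained(model_id, subfolder='unet')

network = create_network(1.0, 4, 4, vae=None, text_encoder=text_encoder, unet=unet)
network.apply_to(text_encoder, unet, apply_text_encoder=True, apply_unet=True)

params = network.prepare_optimizer_params(text_encoder_lr=5e-5, unet_lr=1e-4)
optimizer = torch.optim.AdamW(params)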
StableTuner_RunPod_Fix/model_util.py
ADDED
@@ -0,0 +1,1543 @@
1 |
+
# v1: split from train_db_fixed.py.
|
2 |
+
# v2: support safetensors
|
3 |
+
|
4 |
+
import math
|
5 |
+
import os
|
6 |
+
import torch
|
7 |
+
from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextConfig
|
8 |
+
from diffusers import (
|
9 |
+
AutoencoderKL,
|
10 |
+
DDIMScheduler,
|
11 |
+
StableDiffusionPipeline,
|
12 |
+
UNet2DConditionModel,
|
13 |
+
)
|
14 |
+
from safetensors.torch import load_file, save_file
|
15 |
+
|
16 |
+
# DiffUsers版StableDiffusionのモデルパラメータ
|
17 |
+
NUM_TRAIN_TIMESTEPS = 1000
|
18 |
+
BETA_START = 0.00085
|
19 |
+
BETA_END = 0.0120
|
20 |
+
|
21 |
+
UNET_PARAMS_MODEL_CHANNELS = 320
|
22 |
+
UNET_PARAMS_CHANNEL_MULT = [1, 2, 4, 4]
|
23 |
+
UNET_PARAMS_ATTENTION_RESOLUTIONS = [4, 2, 1]
|
24 |
+
UNET_PARAMS_IMAGE_SIZE = 32 # unused
|
25 |
+
UNET_PARAMS_IN_CHANNELS = 4
|
26 |
+
UNET_PARAMS_OUT_CHANNELS = 4
|
27 |
+
UNET_PARAMS_NUM_RES_BLOCKS = 2
|
28 |
+
UNET_PARAMS_CONTEXT_DIM = 768
|
29 |
+
UNET_PARAMS_NUM_HEADS = 8
|
30 |
+
|
31 |
+
VAE_PARAMS_Z_CHANNELS = 4
|
32 |
+
VAE_PARAMS_RESOLUTION = 256
|
33 |
+
VAE_PARAMS_IN_CHANNELS = 3
|
34 |
+
VAE_PARAMS_OUT_CH = 3
|
35 |
+
VAE_PARAMS_CH = 128
|
36 |
+
VAE_PARAMS_CH_MULT = [1, 2, 4, 4]
|
37 |
+
VAE_PARAMS_NUM_RES_BLOCKS = 2
|
38 |
+
|
39 |
+
# V2
|
40 |
+
V2_UNET_PARAMS_ATTENTION_HEAD_DIM = [5, 10, 20, 20]
|
41 |
+
V2_UNET_PARAMS_CONTEXT_DIM = 1024
|
42 |
+
|
43 |
+
# Diffusersの設定を読み込むための参照モデル
|
44 |
+
DIFFUSERS_REF_MODEL_ID_V1 = 'runwayml/stable-diffusion-v1-5'
|
45 |
+
DIFFUSERS_REF_MODEL_ID_V2 = 'stabilityai/stable-diffusion-2-1'
|
46 |
+
|
47 |
+
|
48 |
+
# region StableDiffusion->Diffusersの変換コード
|
49 |
+
# convert_original_stable_diffusion_to_diffusers をコピーして修正している(ASL 2.0)
|
50 |
+
|
51 |
+
|
52 |
+
def shave_segments(path, n_shave_prefix_segments=1):
|
53 |
+
"""
|
54 |
+
Removes segments. Positive values shave the first segments, negative shave the last segments.
|
55 |
+
"""
|
56 |
+
if n_shave_prefix_segments >= 0:
|
57 |
+
return '.'.join(path.split('.')[n_shave_prefix_segments:])
|
58 |
+
else:
|
59 |
+
return '.'.join(path.split('.')[:n_shave_prefix_segments])
|
60 |
+
|
61 |
+
|
62 |
+
def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
|
63 |
+
"""
|
64 |
+
Updates paths inside resnets to the new naming scheme (local renaming)
|
65 |
+
"""
|
66 |
+
mapping = []
|
67 |
+
for old_item in old_list:
|
68 |
+
new_item = old_item.replace('in_layers.0', 'norm1')
|
69 |
+
new_item = new_item.replace('in_layers.2', 'conv1')
|
70 |
+
|
71 |
+
new_item = new_item.replace('out_layers.0', 'norm2')
|
72 |
+
new_item = new_item.replace('out_layers.3', 'conv2')
|
73 |
+
|
74 |
+
new_item = new_item.replace('emb_layers.1', 'time_emb_proj')
|
75 |
+
new_item = new_item.replace('skip_connection', 'conv_shortcut')
|
76 |
+
|
77 |
+
new_item = shave_segments(
|
78 |
+
new_item, n_shave_prefix_segments=n_shave_prefix_segments
|
79 |
+
)
|
80 |
+
|
81 |
+
mapping.append({'old': old_item, 'new': new_item})
|
82 |
+
|
83 |
+
return mapping
|
84 |
+
|
85 |
+
|
86 |
+
def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
|
87 |
+
"""
|
88 |
+
Updates paths inside resnets to the new naming scheme (local renaming)
|
89 |
+
"""
|
90 |
+
mapping = []
|
91 |
+
for old_item in old_list:
|
92 |
+
new_item = old_item
|
93 |
+
|
94 |
+
new_item = new_item.replace('nin_shortcut', 'conv_shortcut')
|
95 |
+
new_item = shave_segments(
|
96 |
+
new_item, n_shave_prefix_segments=n_shave_prefix_segments
|
97 |
+
)
|
98 |
+
|
99 |
+
mapping.append({'old': old_item, 'new': new_item})
|
100 |
+
|
101 |
+
return mapping
|
102 |
+
|
103 |
+
|
104 |
+
def renew_attention_paths(old_list, n_shave_prefix_segments=0):
|
105 |
+
"""
|
106 |
+
Updates paths inside attentions to the new naming scheme (local renaming)
|
107 |
+
"""
|
108 |
+
mapping = []
|
109 |
+
for old_item in old_list:
|
110 |
+
new_item = old_item
|
111 |
+
|
112 |
+
# new_item = new_item.replace('norm.weight', 'group_norm.weight')
|
113 |
+
# new_item = new_item.replace('norm.bias', 'group_norm.bias')
|
114 |
+
|
115 |
+
# new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
|
116 |
+
# new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
|
117 |
+
|
118 |
+
# new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
|
119 |
+
|
120 |
+
mapping.append({'old': old_item, 'new': new_item})
|
121 |
+
|
122 |
+
return mapping
|
123 |
+
|
124 |
+
|
125 |
+
def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
|
126 |
+
"""
|
127 |
+
Updates paths inside attentions to the new naming scheme (local renaming)
|
128 |
+
"""
|
129 |
+
mapping = []
|
130 |
+
for old_item in old_list:
|
131 |
+
new_item = old_item
|
132 |
+
|
133 |
+
new_item = new_item.replace('norm.weight', 'group_norm.weight')
|
134 |
+
new_item = new_item.replace('norm.bias', 'group_norm.bias')
|
135 |
+
|
136 |
+
new_item = new_item.replace('q.weight', 'query.weight')
|
137 |
+
new_item = new_item.replace('q.bias', 'query.bias')
|
138 |
+
|
139 |
+
new_item = new_item.replace('k.weight', 'key.weight')
|
140 |
+
new_item = new_item.replace('k.bias', 'key.bias')
|
141 |
+
|
142 |
+
new_item = new_item.replace('v.weight', 'value.weight')
|
143 |
+
new_item = new_item.replace('v.bias', 'value.bias')
|
144 |
+
|
145 |
+
new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
|
146 |
+
new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
|
147 |
+
|
148 |
+
new_item = shave_segments(
|
149 |
+
new_item, n_shave_prefix_segments=n_shave_prefix_segments
|
150 |
+
)
|
151 |
+
|
152 |
+
mapping.append({'old': old_item, 'new': new_item})
|
153 |
+
|
154 |
+
return mapping
|
155 |
+
|
156 |
+
|
157 |
+
def assign_to_checkpoint(
|
158 |
+
paths,
|
159 |
+
checkpoint,
|
160 |
+
old_checkpoint,
|
161 |
+
attention_paths_to_split=None,
|
162 |
+
additional_replacements=None,
|
163 |
+
config=None,
|
164 |
+
):
|
165 |
+
"""
|
166 |
+
This does the final conversion step: take locally converted weights and apply a global renaming
|
167 |
+
to them. It splits attention layers, and takes into account additional replacements
|
168 |
+
that may arise.
|
169 |
+
Assigns the weights to the new checkpoint.
|
170 |
+
"""
|
171 |
+
assert isinstance(
|
172 |
+
paths, list
|
173 |
+
), "Paths should be a list of dicts containing 'old' and 'new' keys."
|
174 |
+
|
175 |
+
# Splits the attention layers into three variables.
|
176 |
+
if attention_paths_to_split is not None:
|
177 |
+
for path, path_map in attention_paths_to_split.items():
|
178 |
+
old_tensor = old_checkpoint[path]
|
179 |
+
channels = old_tensor.shape[0] // 3
|
180 |
+
|
181 |
+
target_shape = (
|
182 |
+
(-1, channels) if len(old_tensor.shape) == 3 else (-1)
|
183 |
+
)
|
184 |
+
|
185 |
+
num_heads = old_tensor.shape[0] // config['num_head_channels'] // 3
|
186 |
+
|
187 |
+
old_tensor = old_tensor.reshape(
|
188 |
+
(num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]
|
189 |
+
)
|
190 |
+
query, key, value = old_tensor.split(channels // num_heads, dim=1)
|
191 |
+
|
192 |
+
checkpoint[path_map['query']] = query.reshape(target_shape)
|
193 |
+
checkpoint[path_map['key']] = key.reshape(target_shape)
|
194 |
+
checkpoint[path_map['value']] = value.reshape(target_shape)
|
195 |
+
|
196 |
+
for path in paths:
|
197 |
+
new_path = path['new']
|
198 |
+
|
199 |
+
# These have already been assigned
|
200 |
+
if (
|
201 |
+
attention_paths_to_split is not None
|
202 |
+
and new_path in attention_paths_to_split
|
203 |
+
):
|
204 |
+
continue
|
205 |
+
|
206 |
+
# Global renaming happens here
|
207 |
+
new_path = new_path.replace('middle_block.0', 'mid_block.resnets.0')
|
208 |
+
new_path = new_path.replace('middle_block.1', 'mid_block.attentions.0')
|
209 |
+
new_path = new_path.replace('middle_block.2', 'mid_block.resnets.1')
|
210 |
+
|
211 |
+
if additional_replacements is not None:
|
212 |
+
for replacement in additional_replacements:
|
213 |
+
new_path = new_path.replace(
|
214 |
+
replacement['old'], replacement['new']
|
215 |
+
)
|
216 |
+
|
217 |
+
# proj_attn.weight has to be converted from conv 1D to linear
|
218 |
+
if 'proj_attn.weight' in new_path:
|
219 |
+
checkpoint[new_path] = old_checkpoint[path['old']][:, :, 0]
|
220 |
+
else:
|
221 |
+
checkpoint[new_path] = old_checkpoint[path['old']]
|
222 |
+
|
223 |
+
|
224 |
+
def conv_attn_to_linear(checkpoint):
|
225 |
+
keys = list(checkpoint.keys())
|
226 |
+
attn_keys = ['query.weight', 'key.weight', 'value.weight']
|
227 |
+
for key in keys:
|
228 |
+
if '.'.join(key.split('.')[-2:]) in attn_keys:
|
229 |
+
if checkpoint[key].ndim > 2:
|
230 |
+
checkpoint[key] = checkpoint[key][:, :, 0, 0]
|
231 |
+
elif 'proj_attn.weight' in key:
|
232 |
+
if checkpoint[key].ndim > 2:
|
233 |
+
checkpoint[key] = checkpoint[key][:, :, 0]
|
234 |
+
|
235 |
+
|
236 |
+
def linear_transformer_to_conv(checkpoint):
|
237 |
+
keys = list(checkpoint.keys())
|
238 |
+
tf_keys = ['proj_in.weight', 'proj_out.weight']
|
239 |
+
for key in keys:
|
240 |
+
if '.'.join(key.split('.')[-2:]) in tf_keys:
|
241 |
+
if checkpoint[key].ndim == 2:
|
242 |
+
checkpoint[key] = checkpoint[key].unsqueeze(2).unsqueeze(2)
|
243 |
+
|
244 |
+
|
245 |
+
def convert_ldm_unet_checkpoint(v2, checkpoint, config):
|
246 |
+
"""
|
247 |
+
Takes a state dict and a config, and returns a converted checkpoint.
|
248 |
+
"""
|
249 |
+
|
250 |
+
# extract state_dict for UNet
|
251 |
+
unet_state_dict = {}
|
252 |
+
unet_key = 'model.diffusion_model.'
|
253 |
+
keys = list(checkpoint.keys())
|
254 |
+
for key in keys:
|
255 |
+
if key.startswith(unet_key):
|
256 |
+
unet_state_dict[key.replace(unet_key, '')] = checkpoint.pop(key)
|
257 |
+
|
258 |
+
new_checkpoint = {}
|
259 |
+
|
260 |
+
new_checkpoint['time_embedding.linear_1.weight'] = unet_state_dict[
|
261 |
+
'time_embed.0.weight'
|
262 |
+
]
|
263 |
+
new_checkpoint['time_embedding.linear_1.bias'] = unet_state_dict[
|
264 |
+
'time_embed.0.bias'
|
265 |
+
]
|
266 |
+
new_checkpoint['time_embedding.linear_2.weight'] = unet_state_dict[
|
267 |
+
'time_embed.2.weight'
|
268 |
+
]
|
269 |
+
new_checkpoint['time_embedding.linear_2.bias'] = unet_state_dict[
|
270 |
+
'time_embed.2.bias'
|
271 |
+
]
|
272 |
+
|
273 |
+
new_checkpoint['conv_in.weight'] = unet_state_dict[
|
274 |
+
'input_blocks.0.0.weight'
|
275 |
+
]
|
276 |
+
new_checkpoint['conv_in.bias'] = unet_state_dict['input_blocks.0.0.bias']
|
277 |
+
|
278 |
+
new_checkpoint['conv_norm_out.weight'] = unet_state_dict['out.0.weight']
|
279 |
+
new_checkpoint['conv_norm_out.bias'] = unet_state_dict['out.0.bias']
|
280 |
+
new_checkpoint['conv_out.weight'] = unet_state_dict['out.2.weight']
|
281 |
+
new_checkpoint['conv_out.bias'] = unet_state_dict['out.2.bias']
|
282 |
+
|
283 |
+
# Retrieves the keys for the input blocks only
|
284 |
+
num_input_blocks = len(
|
285 |
+
{
|
286 |
+
'.'.join(layer.split('.')[:2])
|
287 |
+
for layer in unet_state_dict
|
288 |
+
if 'input_blocks' in layer
|
289 |
+
}
|
290 |
+
)
|
291 |
+
input_blocks = {
|
292 |
+
layer_id: [
|
293 |
+
key
|
294 |
+
for key in unet_state_dict
|
295 |
+
if f'input_blocks.{layer_id}.' in key
|
296 |
+
]
|
297 |
+
for layer_id in range(num_input_blocks)
|
298 |
+
}
|
299 |
+
|
300 |
+
# Retrieves the keys for the middle blocks only
|
301 |
+
num_middle_blocks = len(
|
302 |
+
{
|
303 |
+
'.'.join(layer.split('.')[:2])
|
304 |
+
for layer in unet_state_dict
|
305 |
+
if 'middle_block' in layer
|
306 |
+
}
|
307 |
+
)
|
308 |
+
middle_blocks = {
|
309 |
+
layer_id: [
|
310 |
+
key
|
311 |
+
for key in unet_state_dict
|
312 |
+
if f'middle_block.{layer_id}.' in key
|
313 |
+
]
|
314 |
+
for layer_id in range(num_middle_blocks)
|
315 |
+
}
|
316 |
+
|
317 |
+
# Retrieves the keys for the output blocks only
|
318 |
+
num_output_blocks = len(
|
319 |
+
{
|
320 |
+
'.'.join(layer.split('.')[:2])
|
321 |
+
for layer in unet_state_dict
|
322 |
+
if 'output_blocks' in layer
|
323 |
+
}
|
324 |
+
)
|
325 |
+
output_blocks = {
|
326 |
+
layer_id: [
|
327 |
+
key
|
328 |
+
for key in unet_state_dict
|
329 |
+
if f'output_blocks.{layer_id}.' in key
|
330 |
+
]
|
331 |
+
for layer_id in range(num_output_blocks)
|
332 |
+
}
|
333 |
+
|
334 |
+
for i in range(1, num_input_blocks):
|
335 |
+
block_id = (i - 1) // (config['layers_per_block'] + 1)
|
336 |
+
layer_in_block_id = (i - 1) % (config['layers_per_block'] + 1)
|
337 |
+
|
338 |
+
resnets = [
|
339 |
+
key
|
340 |
+
for key in input_blocks[i]
|
341 |
+
if f'input_blocks.{i}.0' in key
|
342 |
+
and f'input_blocks.{i}.0.op' not in key
|
343 |
+
]
|
344 |
+
attentions = [
|
345 |
+
key for key in input_blocks[i] if f'input_blocks.{i}.1' in key
|
346 |
+
]
|
347 |
+
|
348 |
+
if f'input_blocks.{i}.0.op.weight' in unet_state_dict:
|
349 |
+
new_checkpoint[
|
350 |
+
f'down_blocks.{block_id}.downsamplers.0.conv.weight'
|
351 |
+
] = unet_state_dict.pop(f'input_blocks.{i}.0.op.weight')
|
352 |
+
new_checkpoint[
|
353 |
+
f'down_blocks.{block_id}.downsamplers.0.conv.bias'
|
354 |
+
] = unet_state_dict.pop(f'input_blocks.{i}.0.op.bias')
|
355 |
+
|
356 |
+
paths = renew_resnet_paths(resnets)
|
357 |
+
meta_path = {
|
358 |
+
'old': f'input_blocks.{i}.0',
|
359 |
+
'new': f'down_blocks.{block_id}.resnets.{layer_in_block_id}',
|
360 |
+
}
|
361 |
+
assign_to_checkpoint(
|
362 |
+
paths,
|
363 |
+
new_checkpoint,
|
364 |
+
unet_state_dict,
|
365 |
+
additional_replacements=[meta_path],
|
366 |
+
config=config,
|
367 |
+
)
|
368 |
+
|
369 |
+
if len(attentions):
|
370 |
+
paths = renew_attention_paths(attentions)
|
371 |
+
meta_path = {
|
372 |
+
'old': f'input_blocks.{i}.1',
|
373 |
+
'new': f'down_blocks.{block_id}.attentions.{layer_in_block_id}',
|
374 |
+
}
|
375 |
+
assign_to_checkpoint(
|
376 |
+
paths,
|
377 |
+
new_checkpoint,
|
378 |
+
unet_state_dict,
|
379 |
+
additional_replacements=[meta_path],
|
380 |
+
config=config,
|
381 |
+
)
|
382 |
+
|
383 |
+
resnet_0 = middle_blocks[0]
|
384 |
+
attentions = middle_blocks[1]
|
385 |
+
resnet_1 = middle_blocks[2]
|
386 |
+
|
387 |
+
resnet_0_paths = renew_resnet_paths(resnet_0)
|
388 |
+
assign_to_checkpoint(
|
389 |
+
resnet_0_paths, new_checkpoint, unet_state_dict, config=config
|
390 |
+
)
|
391 |
+
|
392 |
+
resnet_1_paths = renew_resnet_paths(resnet_1)
|
393 |
+
assign_to_checkpoint(
|
394 |
+
resnet_1_paths, new_checkpoint, unet_state_dict, config=config
|
395 |
+
)
|
396 |
+
|
397 |
+
attentions_paths = renew_attention_paths(attentions)
|
398 |
+
meta_path = {'old': 'middle_block.1', 'new': 'mid_block.attentions.0'}
|
399 |
+
assign_to_checkpoint(
|
400 |
+
attentions_paths,
|
401 |
+
new_checkpoint,
|
402 |
+
unet_state_dict,
|
403 |
+
additional_replacements=[meta_path],
|
404 |
+
config=config,
|
405 |
+
)
|
406 |
+
|
407 |
+
for i in range(num_output_blocks):
|
408 |
+
block_id = i // (config['layers_per_block'] + 1)
|
409 |
+
layer_in_block_id = i % (config['layers_per_block'] + 1)
|
410 |
+
output_block_layers = [
|
411 |
+
shave_segments(name, 2) for name in output_blocks[i]
|
412 |
+
]
|
413 |
+
output_block_list = {}
|
414 |
+
|
415 |
+
for layer in output_block_layers:
|
416 |
+
layer_id, layer_name = layer.split('.')[0], shave_segments(
|
417 |
+
layer, 1
|
418 |
+
)
|
419 |
+
if layer_id in output_block_list:
|
420 |
+
output_block_list[layer_id].append(layer_name)
|
421 |
+
else:
|
422 |
+
output_block_list[layer_id] = [layer_name]
|
423 |
+
|
424 |
+
if len(output_block_list) > 1:
|
425 |
+
resnets = [
|
426 |
+
key
|
427 |
+
for key in output_blocks[i]
|
428 |
+
if f'output_blocks.{i}.0' in key
|
429 |
+
]
|
430 |
+
attentions = [
|
431 |
+
key
|
432 |
+
for key in output_blocks[i]
|
433 |
+
if f'output_blocks.{i}.1' in key
|
434 |
+
]
|
435 |
+
|
436 |
+
resnet_0_paths = renew_resnet_paths(resnets)
|
437 |
+
paths = renew_resnet_paths(resnets)
|
438 |
+
|
439 |
+
meta_path = {
|
440 |
+
'old': f'output_blocks.{i}.0',
|
441 |
+
'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}',
|
442 |
+
}
|
443 |
+
assign_to_checkpoint(
|
444 |
+
paths,
|
445 |
+
new_checkpoint,
|
446 |
+
unet_state_dict,
|
447 |
+
additional_replacements=[meta_path],
|
448 |
+
config=config,
|
449 |
+
)
|
450 |
+
|
451 |
+
# オリジナル:
|
452 |
+
# if ["conv.weight", "conv.bias"] in output_block_list.values():
|
453 |
+
# index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
|
454 |
+
|
455 |
+
# biasとweightの順番に依存しないようにする:もっといいやり方がありそうだが
|
456 |
+
for l in output_block_list.values():
|
457 |
+
l.sort()
|
458 |
+
|
459 |
+
if ['conv.bias', 'conv.weight'] in output_block_list.values():
|
460 |
+
index = list(output_block_list.values()).index(
|
461 |
+
['conv.bias', 'conv.weight']
|
462 |
+
)
|
463 |
+
new_checkpoint[
|
464 |
+
f'up_blocks.{block_id}.upsamplers.0.conv.bias'
|
465 |
+
] = unet_state_dict[f'output_blocks.{i}.{index}.conv.bias']
|
466 |
+
new_checkpoint[
|
467 |
+
f'up_blocks.{block_id}.upsamplers.0.conv.weight'
|
468 |
+
] = unet_state_dict[f'output_blocks.{i}.{index}.conv.weight']
|
469 |
+
|
470 |
+
# Clear attentions as they have been attributed above.
|
471 |
+
if len(attentions) == 2:
|
472 |
+
attentions = []
|
473 |
+
|
474 |
+
if len(attentions):
|
475 |
+
paths = renew_attention_paths(attentions)
|
476 |
+
meta_path = {
|
477 |
+
'old': f'output_blocks.{i}.1',
|
478 |
+
'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}',
|
479 |
+
}
|
480 |
+
assign_to_checkpoint(
|
481 |
+
paths,
|
482 |
+
new_checkpoint,
|
483 |
+
unet_state_dict,
|
484 |
+
additional_replacements=[meta_path],
|
485 |
+
config=config,
|
486 |
+
)
|
487 |
+
else:
|
488 |
+
resnet_0_paths = renew_resnet_paths(
|
489 |
+
output_block_layers, n_shave_prefix_segments=1
|
490 |
+
)
|
491 |
+
for path in resnet_0_paths:
|
492 |
+
old_path = '.'.join(['output_blocks', str(i), path['old']])
|
493 |
+
new_path = '.'.join(
|
494 |
+
[
|
495 |
+
'up_blocks',
|
496 |
+
str(block_id),
|
497 |
+
'resnets',
|
498 |
+
str(layer_in_block_id),
|
499 |
+
path['new'],
|
500 |
+
]
|
501 |
+
)
|
502 |
+
|
503 |
+
new_checkpoint[new_path] = unet_state_dict[old_path]
|
504 |
+
|
505 |
+
# SDのv2では1*1のconv2dがlinearに変わっているので、linear->convに変換する
|
506 |
+
if v2:
|
507 |
+
linear_transformer_to_conv(new_checkpoint)
|
508 |
+
|
509 |
+
return new_checkpoint
|
510 |
+
|
511 |
+
|
512 |
+
def convert_ldm_vae_checkpoint(checkpoint, config):
|
513 |
+
# extract state dict for VAE
|
514 |
+
vae_state_dict = {}
|
515 |
+
vae_key = 'first_stage_model.'
|
516 |
+
keys = list(checkpoint.keys())
|
517 |
+
for key in keys:
|
518 |
+
if key.startswith(vae_key):
|
519 |
+
vae_state_dict[key.replace(vae_key, '')] = checkpoint.get(key)
|
520 |
+
# if len(vae_state_dict) == 0:
|
521 |
+
# # 渡されたcheckpointは.ckptから読み込んだcheckpointではなくvaeのstate_dict
|
522 |
+
# vae_state_dict = checkpoint
|
523 |
+
|
524 |
+
new_checkpoint = {}
|
525 |
+
|
526 |
+
new_checkpoint['encoder.conv_in.weight'] = vae_state_dict[
|
527 |
+
'encoder.conv_in.weight'
|
528 |
+
]
|
529 |
+
new_checkpoint['encoder.conv_in.bias'] = vae_state_dict[
|
530 |
+
'encoder.conv_in.bias'
|
531 |
+
]
|
532 |
+
new_checkpoint['encoder.conv_out.weight'] = vae_state_dict[
|
533 |
+
'encoder.conv_out.weight'
|
534 |
+
]
|
535 |
+
new_checkpoint['encoder.conv_out.bias'] = vae_state_dict[
|
536 |
+
'encoder.conv_out.bias'
|
537 |
+
]
|
538 |
+
new_checkpoint['encoder.conv_norm_out.weight'] = vae_state_dict[
|
539 |
+
'encoder.norm_out.weight'
|
540 |
+
]
|
541 |
+
new_checkpoint['encoder.conv_norm_out.bias'] = vae_state_dict[
|
542 |
+
'encoder.norm_out.bias'
|
543 |
+
]
|
544 |
+
|
545 |
+
new_checkpoint['decoder.conv_in.weight'] = vae_state_dict[
|
546 |
+
'decoder.conv_in.weight'
|
547 |
+
]
|
548 |
+
new_checkpoint['decoder.conv_in.bias'] = vae_state_dict[
|
549 |
+
'decoder.conv_in.bias'
|
550 |
+
]
|
551 |
+
new_checkpoint['decoder.conv_out.weight'] = vae_state_dict[
|
552 |
+
'decoder.conv_out.weight'
|
553 |
+
]
|
554 |
+
new_checkpoint['decoder.conv_out.bias'] = vae_state_dict[
|
555 |
+
'decoder.conv_out.bias'
|
556 |
+
]
|
557 |
+
new_checkpoint['decoder.conv_norm_out.weight'] = vae_state_dict[
|
558 |
+
'decoder.norm_out.weight'
|
559 |
+
]
|
560 |
+
new_checkpoint['decoder.conv_norm_out.bias'] = vae_state_dict[
|
561 |
+
'decoder.norm_out.bias'
|
562 |
+
]
|
563 |
+
|
564 |
+
new_checkpoint['quant_conv.weight'] = vae_state_dict['quant_conv.weight']
|
565 |
+
new_checkpoint['quant_conv.bias'] = vae_state_dict['quant_conv.bias']
|
566 |
+
new_checkpoint['post_quant_conv.weight'] = vae_state_dict[
|
567 |
+
'post_quant_conv.weight'
|
568 |
+
]
|
569 |
+
new_checkpoint['post_quant_conv.bias'] = vae_state_dict[
|
570 |
+
'post_quant_conv.bias'
|
571 |
+
]
|
572 |
+
|
573 |
+
# Retrieves the keys for the encoder down blocks only
|
574 |
+
num_down_blocks = len(
|
575 |
+
{
|
576 |
+
'.'.join(layer.split('.')[:3])
|
577 |
+
for layer in vae_state_dict
|
578 |
+
if 'encoder.down' in layer
|
579 |
+
}
|
580 |
+
)
|
581 |
+
down_blocks = {
|
582 |
+
layer_id: [key for key in vae_state_dict if f'down.{layer_id}' in key]
|
583 |
+
for layer_id in range(num_down_blocks)
|
584 |
+
}
|
585 |
+
|
586 |
+
# Retrieves the keys for the decoder up blocks only
|
587 |
+
num_up_blocks = len(
|
588 |
+
{
|
589 |
+
'.'.join(layer.split('.')[:3])
|
590 |
+
for layer in vae_state_dict
|
591 |
+
if 'decoder.up' in layer
|
592 |
+
}
|
593 |
+
)
|
594 |
+
up_blocks = {
|
595 |
+
layer_id: [key for key in vae_state_dict if f'up.{layer_id}' in key]
|
596 |
+
for layer_id in range(num_up_blocks)
|
597 |
+
}
|
598 |
+
|
599 |
+
for i in range(num_down_blocks):
|
600 |
+
resnets = [
|
601 |
+
key
|
602 |
+
for key in down_blocks[i]
|
603 |
+
if f'down.{i}' in key and f'down.{i}.downsample' not in key
|
604 |
+
]
|
605 |
+
|
606 |
+
if f'encoder.down.{i}.downsample.conv.weight' in vae_state_dict:
|
607 |
+
new_checkpoint[
|
608 |
+
f'encoder.down_blocks.{i}.downsamplers.0.conv.weight'
|
609 |
+
] = vae_state_dict.pop(f'encoder.down.{i}.downsample.conv.weight')
|
610 |
+
new_checkpoint[
|
611 |
+
f'encoder.down_blocks.{i}.downsamplers.0.conv.bias'
|
612 |
+
] = vae_state_dict.pop(f'encoder.down.{i}.downsample.conv.bias')
|
613 |
+
|
614 |
+
paths = renew_vae_resnet_paths(resnets)
|
615 |
+
meta_path = {
|
616 |
+
'old': f'down.{i}.block',
|
617 |
+
'new': f'down_blocks.{i}.resnets',
|
618 |
+
}
|
619 |
+
assign_to_checkpoint(
|
620 |
+
paths,
|
621 |
+
new_checkpoint,
|
622 |
+
vae_state_dict,
|
623 |
+
additional_replacements=[meta_path],
|
624 |
+
config=config,
|
625 |
+
)
|
626 |
+
|
627 |
+
mid_resnets = [key for key in vae_state_dict if 'encoder.mid.block' in key]
|
628 |
+
num_mid_res_blocks = 2
|
629 |
+
for i in range(1, num_mid_res_blocks + 1):
|
630 |
+
resnets = [
|
631 |
+
key for key in mid_resnets if f'encoder.mid.block_{i}' in key
|
632 |
+
]
|
633 |
+
|
634 |
+
paths = renew_vae_resnet_paths(resnets)
|
635 |
+
meta_path = {
|
636 |
+
'old': f'mid.block_{i}',
|
637 |
+
'new': f'mid_block.resnets.{i - 1}',
|
638 |
+
}
|
639 |
+
assign_to_checkpoint(
|
640 |
+
paths,
|
641 |
+
new_checkpoint,
|
642 |
+
vae_state_dict,
|
643 |
+
additional_replacements=[meta_path],
|
644 |
+
config=config,
|
645 |
+
)
|
646 |
+
|
647 |
+
mid_attentions = [
|
648 |
+
key for key in vae_state_dict if 'encoder.mid.attn' in key
|
649 |
+
]
|
650 |
+
paths = renew_vae_attention_paths(mid_attentions)
|
651 |
+
meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'}
|
652 |
+
assign_to_checkpoint(
|
653 |
+
paths,
|
654 |
+
new_checkpoint,
|
655 |
+
vae_state_dict,
|
656 |
+
additional_replacements=[meta_path],
|
657 |
+
config=config,
|
658 |
+
)
|
659 |
+
conv_attn_to_linear(new_checkpoint)
|
660 |
+
|
661 |
+
for i in range(num_up_blocks):
|
662 |
+
block_id = num_up_blocks - 1 - i
|
663 |
+
resnets = [
|
664 |
+
key
|
665 |
+
for key in up_blocks[block_id]
|
666 |
+
if f'up.{block_id}' in key and f'up.{block_id}.upsample' not in key
|
667 |
+
]
|
668 |
+
|
669 |
+
if f'decoder.up.{block_id}.upsample.conv.weight' in vae_state_dict:
|
670 |
+
new_checkpoint[
|
671 |
+
f'decoder.up_blocks.{i}.upsamplers.0.conv.weight'
|
672 |
+
] = vae_state_dict[f'decoder.up.{block_id}.upsample.conv.weight']
|
673 |
+
new_checkpoint[
|
674 |
+
f'decoder.up_blocks.{i}.upsamplers.0.conv.bias'
|
675 |
+
] = vae_state_dict[f'decoder.up.{block_id}.upsample.conv.bias']
|
676 |
+
|
677 |
+
paths = renew_vae_resnet_paths(resnets)
|
678 |
+
meta_path = {
|
679 |
+
'old': f'up.{block_id}.block',
|
680 |
+
'new': f'up_blocks.{i}.resnets',
|
681 |
+
}
|
682 |
+
assign_to_checkpoint(
|
683 |
+
paths,
|
684 |
+
new_checkpoint,
|
685 |
+
vae_state_dict,
|
686 |
+
additional_replacements=[meta_path],
|
687 |
+
config=config,
|
688 |
+
)
|
689 |
+
|
690 |
+
mid_resnets = [key for key in vae_state_dict if 'decoder.mid.block' in key]
|
691 |
+
num_mid_res_blocks = 2
|
692 |
+
for i in range(1, num_mid_res_blocks + 1):
|
693 |
+
resnets = [
|
694 |
+
key for key in mid_resnets if f'decoder.mid.block_{i}' in key
|
695 |
+
]
|
696 |
+
|
697 |
+
paths = renew_vae_resnet_paths(resnets)
|
698 |
+
meta_path = {
|
699 |
+
'old': f'mid.block_{i}',
|
700 |
+
'new': f'mid_block.resnets.{i - 1}',
|
701 |
+
}
|
702 |
+
assign_to_checkpoint(
|
703 |
+
paths,
|
704 |
+
new_checkpoint,
|
705 |
+
vae_state_dict,
|
706 |
+
additional_replacements=[meta_path],
|
707 |
+
config=config,
|
708 |
+
)
|
709 |
+
|
710 |
+
mid_attentions = [
|
711 |
+
key for key in vae_state_dict if 'decoder.mid.attn' in key
|
712 |
+
]
|
713 |
+
paths = renew_vae_attention_paths(mid_attentions)
|
714 |
+
meta_path = {'old': 'mid.attn_1', 'new': 'mid_block.attentions.0'}
|
715 |
+
assign_to_checkpoint(
|
716 |
+
paths,
|
717 |
+
new_checkpoint,
|
718 |
+
vae_state_dict,
|
719 |
+
additional_replacements=[meta_path],
|
720 |
+
config=config,
|
721 |
+
)
|
722 |
+
conv_attn_to_linear(new_checkpoint)
|
723 |
+
return new_checkpoint
|
724 |
+
|
725 |
+
|
726 |
+
def create_unet_diffusers_config(v2):
|
727 |
+
"""
|
728 |
+
Creates a config for the diffusers based on the config of the LDM model.
|
729 |
+
"""
|
730 |
+
# unet_params = original_config.model.params.unet_config.params
|
731 |
+
|
732 |
+
block_out_channels = [
|
733 |
+
UNET_PARAMS_MODEL_CHANNELS * mult for mult in UNET_PARAMS_CHANNEL_MULT
|
734 |
+
]
|
735 |
+
|
736 |
+
down_block_types = []
|
737 |
+
resolution = 1
|
738 |
+
for i in range(len(block_out_channels)):
|
739 |
+
block_type = (
|
740 |
+
'CrossAttnDownBlock2D'
|
741 |
+
if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS
|
742 |
+
else 'DownBlock2D'
|
743 |
+
)
|
744 |
+
down_block_types.append(block_type)
|
745 |
+
if i != len(block_out_channels) - 1:
|
746 |
+
resolution *= 2
|
747 |
+
|
748 |
+
up_block_types = []
|
749 |
+
for i in range(len(block_out_channels)):
|
750 |
+
block_type = (
|
751 |
+
'CrossAttnUpBlock2D'
|
752 |
+
if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS
|
753 |
+
else 'UpBlock2D'
|
754 |
+
)
|
755 |
+
up_block_types.append(block_type)
|
756 |
+
resolution //= 2
|
757 |
+
|
758 |
+
config = dict(
|
759 |
+
sample_size=UNET_PARAMS_IMAGE_SIZE,
|
760 |
+
in_channels=UNET_PARAMS_IN_CHANNELS,
|
761 |
+
out_channels=UNET_PARAMS_OUT_CHANNELS,
|
762 |
+
down_block_types=tuple(down_block_types),
|
763 |
+
up_block_types=tuple(up_block_types),
|
764 |
+
block_out_channels=tuple(block_out_channels),
|
765 |
+
layers_per_block=UNET_PARAMS_NUM_RES_BLOCKS,
|
766 |
+
cross_attention_dim=UNET_PARAMS_CONTEXT_DIM
|
767 |
+
if not v2
|
768 |
+
else V2_UNET_PARAMS_CONTEXT_DIM,
|
769 |
+
attention_head_dim=UNET_PARAMS_NUM_HEADS
|
770 |
+
if not v2
|
771 |
+
else V2_UNET_PARAMS_ATTENTION_HEAD_DIM,
|
772 |
+
)
|
773 |
+
|
774 |
+
return config
|
775 |
+
|
776 |
+
|
777 |
+
def create_vae_diffusers_config():
|
778 |
+
"""
|
779 |
+
Creates a config for the diffusers based on the config of the LDM model.
|
780 |
+
"""
|
781 |
+
# vae_params = original_config.model.params.first_stage_config.params.ddconfig
|
782 |
+
# _ = original_config.model.params.first_stage_config.params.embed_dim
|
783 |
+
block_out_channels = [VAE_PARAMS_CH * mult for mult in VAE_PARAMS_CH_MULT]
|
784 |
+
down_block_types = ['DownEncoderBlock2D'] * len(block_out_channels)
|
785 |
+
up_block_types = ['UpDecoderBlock2D'] * len(block_out_channels)
|
786 |
+
|
787 |
+
config = dict(
|
788 |
+
sample_size=VAE_PARAMS_RESOLUTION,
|
789 |
+
in_channels=VAE_PARAMS_IN_CHANNELS,
|
790 |
+
out_channels=VAE_PARAMS_OUT_CH,
|
791 |
+
down_block_types=tuple(down_block_types),
|
792 |
+
up_block_types=tuple(up_block_types),
|
793 |
+
block_out_channels=tuple(block_out_channels),
|
794 |
+
latent_channels=VAE_PARAMS_Z_CHANNELS,
|
795 |
+
layers_per_block=VAE_PARAMS_NUM_RES_BLOCKS,
|
796 |
+
)
|
797 |
+
return config
|
798 |
+
|
799 |
+
|
800 |
+
def convert_ldm_clip_checkpoint_v1(checkpoint):
|
801 |
+
keys = list(checkpoint.keys())
|
802 |
+
text_model_dict = {}
|
803 |
+
for key in keys:
|
804 |
+
if key.startswith('cond_stage_model.transformer'):
|
805 |
+
text_model_dict[
|
806 |
+
key[len('cond_stage_model.transformer.') :]
|
807 |
+
] = checkpoint[key]
|
808 |
+
return text_model_dict
|
809 |
+
|
810 |
+
|
811 |
+
def convert_ldm_clip_checkpoint_v2(checkpoint, max_length):
|
812 |
+
# 嫌になるくらい違うぞ!
|
813 |
+
def convert_key(key):
|
814 |
+
if not key.startswith('cond_stage_model'):
|
815 |
+
return None
|
816 |
+
|
817 |
+
# common conversion
|
818 |
+
key = key.replace(
|
819 |
+
'cond_stage_model.model.transformer.', 'text_model.encoder.'
|
820 |
+
)
|
821 |
+
key = key.replace('cond_stage_model.model.', 'text_model.')
|
822 |
+
|
823 |
+
if 'resblocks' in key:
|
824 |
+
# resblocks conversion
|
825 |
+
key = key.replace('.resblocks.', '.layers.')
|
826 |
+
if '.ln_' in key:
|
827 |
+
key = key.replace('.ln_', '.layer_norm')
|
828 |
+
elif '.mlp.' in key:
|
829 |
+
key = key.replace('.c_fc.', '.fc1.')
|
830 |
+
key = key.replace('.c_proj.', '.fc2.')
|
831 |
+
elif '.attn.out_proj' in key:
|
832 |
+
key = key.replace('.attn.out_proj.', '.self_attn.out_proj.')
|
833 |
+
elif '.attn.in_proj' in key:
|
834 |
+
key = None # 特殊なので後で処理する
|
835 |
+
else:
|
836 |
+
raise ValueError(f'unexpected key in SD: {key}')
|
837 |
+
elif '.positional_embedding' in key:
|
838 |
+
key = key.replace(
|
839 |
+
'.positional_embedding',
|
840 |
+
'.embeddings.position_embedding.weight',
|
841 |
+
)
|
842 |
+
elif '.text_projection' in key:
|
843 |
+
key = None # 使われない???
|
844 |
+
elif '.logit_scale' in key:
|
845 |
+
key = None # 使われない???
|
846 |
+
elif '.token_embedding' in key:
|
847 |
+
key = key.replace(
|
848 |
+
'.token_embedding.weight', '.embeddings.token_embedding.weight'
|
849 |
+
)
|
850 |
+
elif '.ln_final' in key:
|
851 |
+
key = key.replace('.ln_final', '.final_layer_norm')
|
852 |
+
return key
|
853 |
+
|
854 |
+
keys = list(checkpoint.keys())
|
855 |
+
new_sd = {}
|
856 |
+
for key in keys:
|
857 |
+
# remove resblocks 23
|
858 |
+
if '.resblocks.23.' in key:
|
859 |
+
continue
|
860 |
+
new_key = convert_key(key)
|
861 |
+
if new_key is None:
|
862 |
+
continue
|
863 |
+
new_sd[new_key] = checkpoint[key]
|
864 |
+
|
865 |
+
# attnの変換
|
866 |
+
for key in keys:
|
867 |
+
if '.resblocks.23.' in key:
|
868 |
+
continue
|
869 |
+
if '.resblocks' in key and '.attn.in_proj_' in key:
|
870 |
+
# 三つに分割
|
871 |
+
values = torch.chunk(checkpoint[key], 3)
|
872 |
+
|
873 |
+
key_suffix = '.weight' if 'weight' in key else '.bias'
|
874 |
+
key_pfx = key.replace(
|
875 |
+
'cond_stage_model.model.transformer.resblocks.',
|
876 |
+
'text_model.encoder.layers.',
|
877 |
+
)
|
878 |
+
key_pfx = key_pfx.replace('_weight', '')
|
879 |
+
key_pfx = key_pfx.replace('_bias', '')
|
880 |
+
key_pfx = key_pfx.replace('.attn.in_proj', '.self_attn.')
|
881 |
+
new_sd[key_pfx + 'q_proj' + key_suffix] = values[0]
|
882 |
+
new_sd[key_pfx + 'k_proj' + key_suffix] = values[1]
|
883 |
+
new_sd[key_pfx + 'v_proj' + key_suffix] = values[2]
|
884 |
+
|
885 |
+
# position_idsの追加
|
886 |
+
new_sd['text_model.embeddings.position_ids'] = torch.Tensor(
|
887 |
+
[list(range(max_length))]
|
888 |
+
).to(torch.int64)
|
889 |
+
return new_sd
|
890 |
+
|
891 |
+
|
892 |
+
# endregion
|
893 |
+
|
894 |
+
|
895 |
+
# region Diffusers->StableDiffusion の変換コード
|
896 |
+
# convert_diffusers_to_original_stable_diffusion をコピーして修正している(ASL 2.0)
|
897 |
+
|
898 |
+
|
899 |
+
def conv_transformer_to_linear(checkpoint):
|
900 |
+
keys = list(checkpoint.keys())
|
901 |
+
tf_keys = ['proj_in.weight', 'proj_out.weight']
|
902 |
+
for key in keys:
|
903 |
+
if '.'.join(key.split('.')[-2:]) in tf_keys:
|
904 |
+
if checkpoint[key].ndim > 2:
|
905 |
+
checkpoint[key] = checkpoint[key][:, :, 0, 0]
|
906 |
+
|
907 |
+
|
908 |
+
def convert_unet_state_dict_to_sd(v2, unet_state_dict):
|
909 |
+
unet_conversion_map = [
|
910 |
+
# (stable-diffusion, HF Diffusers)
|
911 |
+
('time_embed.0.weight', 'time_embedding.linear_1.weight'),
|
912 |
+
('time_embed.0.bias', 'time_embedding.linear_1.bias'),
|
913 |
+
('time_embed.2.weight', 'time_embedding.linear_2.weight'),
|
914 |
+
('time_embed.2.bias', 'time_embedding.linear_2.bias'),
|
915 |
+
('input_blocks.0.0.weight', 'conv_in.weight'),
|
916 |
+
('input_blocks.0.0.bias', 'conv_in.bias'),
|
917 |
+
('out.0.weight', 'conv_norm_out.weight'),
|
918 |
+
('out.0.bias', 'conv_norm_out.bias'),
|
919 |
+
('out.2.weight', 'conv_out.weight'),
|
920 |
+
('out.2.bias', 'conv_out.bias'),
|
921 |
+
]
|
922 |
+
|
923 |
+
unet_conversion_map_resnet = [
|
924 |
+
# (stable-diffusion, HF Diffusers)
|
925 |
+
('in_layers.0', 'norm1'),
|
926 |
+
('in_layers.2', 'conv1'),
|
927 |
+
('out_layers.0', 'norm2'),
|
928 |
+
('out_layers.3', 'conv2'),
|
929 |
+
('emb_layers.1', 'time_emb_proj'),
|
930 |
+
('skip_connection', 'conv_shortcut'),
|
931 |
+
]
|
932 |
+
|
933 |
+
unet_conversion_map_layer = []
|
934 |
+
for i in range(4):
|
935 |
+
# loop over downblocks/upblocks
|
936 |
+
|
937 |
+
for j in range(2):
|
938 |
+
# loop over resnets/attentions for downblocks
|
939 |
+
hf_down_res_prefix = f'down_blocks.{i}.resnets.{j}.'
|
940 |
+
sd_down_res_prefix = f'input_blocks.{3*i + j + 1}.0.'
|
941 |
+
unet_conversion_map_layer.append(
|
942 |
+
(sd_down_res_prefix, hf_down_res_prefix)
|
943 |
+
)
|
944 |
+
|
945 |
+
if i < 3:
|
946 |
+
# no attention layers in down_blocks.3
|
947 |
+
hf_down_atn_prefix = f'down_blocks.{i}.attentions.{j}.'
|
948 |
+
sd_down_atn_prefix = f'input_blocks.{3*i + j + 1}.1.'
|
949 |
+
unet_conversion_map_layer.append(
|
950 |
+
(sd_down_atn_prefix, hf_down_atn_prefix)
|
951 |
+
)
|
952 |
+
|
953 |
+
for j in range(3):
|
954 |
+
# loop over resnets/attentions for upblocks
|
955 |
+
hf_up_res_prefix = f'up_blocks.{i}.resnets.{j}.'
|
956 |
+
sd_up_res_prefix = f'output_blocks.{3*i + j}.0.'
|
957 |
+
unet_conversion_map_layer.append(
|
958 |
+
(sd_up_res_prefix, hf_up_res_prefix)
|
959 |
+
)
|
960 |
+
|
961 |
+
if i > 0:
|
962 |
+
# no attention layers in up_blocks.0
|
963 |
+
hf_up_atn_prefix = f'up_blocks.{i}.attentions.{j}.'
|
964 |
+
sd_up_atn_prefix = f'output_blocks.{3*i + j}.1.'
|
965 |
+
unet_conversion_map_layer.append(
|
966 |
+
(sd_up_atn_prefix, hf_up_atn_prefix)
|
967 |
+
)
|
968 |
+
|
969 |
+
if i < 3:
|
970 |
+
# no downsample in down_blocks.3
|
971 |
+
hf_downsample_prefix = f'down_blocks.{i}.downsamplers.0.conv.'
|
972 |
+
sd_downsample_prefix = f'input_blocks.{3*(i+1)}.0.op.'
|
973 |
+
unet_conversion_map_layer.append(
|
974 |
+
(sd_downsample_prefix, hf_downsample_prefix)
|
975 |
+
)
|
976 |
+
|
977 |
+
# no upsample in up_blocks.3
|
978 |
+
hf_upsample_prefix = f'up_blocks.{i}.upsamplers.0.'
|
979 |
+
sd_upsample_prefix = (
|
980 |
+
f'output_blocks.{3*i + 2}.{1 if i == 0 else 2}.'
|
981 |
+
)
|
982 |
+
unet_conversion_map_layer.append(
|
983 |
+
(sd_upsample_prefix, hf_upsample_prefix)
|
984 |
+
)
|
985 |
+
|
986 |
+
hf_mid_atn_prefix = 'mid_block.attentions.0.'
|
987 |
+
sd_mid_atn_prefix = 'middle_block.1.'
|
988 |
+
unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
|
989 |
+
|
990 |
+
for j in range(2):
|
991 |
+
hf_mid_res_prefix = f'mid_block.resnets.{j}.'
|
992 |
+
sd_mid_res_prefix = f'middle_block.{2*j}.'
|
993 |
+
unet_conversion_map_layer.append(
|
994 |
+
(sd_mid_res_prefix, hf_mid_res_prefix)
|
995 |
+
)
|
996 |
+
|
997 |
+
# buyer beware: this is a *brittle* function,
|
998 |
+
# and correct output requires that all of these pieces interact in
|
999 |
+
# the exact order in which I have arranged them.
|
1000 |
+
mapping = {k: k for k in unet_state_dict.keys()}
|
1001 |
+
for sd_name, hf_name in unet_conversion_map:
|
1002 |
+
mapping[hf_name] = sd_name
|
1003 |
+
for k, v in mapping.items():
|
1004 |
+
if 'resnets' in k:
|
1005 |
+
for sd_part, hf_part in unet_conversion_map_resnet:
|
1006 |
+
v = v.replace(hf_part, sd_part)
|
1007 |
+
mapping[k] = v
|
1008 |
+
for k, v in mapping.items():
|
1009 |
+
for sd_part, hf_part in unet_conversion_map_layer:
|
1010 |
+
v = v.replace(hf_part, sd_part)
|
1011 |
+
mapping[k] = v
|
1012 |
+
new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
|
1013 |
+
|
1014 |
+
if v2:
|
1015 |
+
conv_transformer_to_linear(new_state_dict)
|
1016 |
+
|
1017 |
+
return new_state_dict
|
1018 |
+
|
1019 |
+
|
1020 |
+
# ================#
|
1021 |
+
# VAE Conversion #
|
1022 |
+
# ================#
|
1023 |
+
|
1024 |
+
|
1025 |
+
def reshape_weight_for_sd(w):
|
1026 |
+
# convert HF linear weights to SD conv2d weights
|
1027 |
+
return w.reshape(*w.shape, 1, 1)
|
1028 |
+
|
1029 |
+
|
1030 |
+
def convert_vae_state_dict(vae_state_dict):
|
1031 |
+
vae_conversion_map = [
|
1032 |
+
# (stable-diffusion, HF Diffusers)
|
1033 |
+
('nin_shortcut', 'conv_shortcut'),
|
1034 |
+
('norm_out', 'conv_norm_out'),
|
1035 |
+
('mid.attn_1.', 'mid_block.attentions.0.'),
|
1036 |
+
]
|
1037 |
+
|
1038 |
+
for i in range(4):
|
1039 |
+
# down_blocks have two resnets
|
1040 |
+
for j in range(2):
|
1041 |
+
hf_down_prefix = f'encoder.down_blocks.{i}.resnets.{j}.'
|
1042 |
+
sd_down_prefix = f'encoder.down.{i}.block.{j}.'
|
1043 |
+
vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
|
1044 |
+
|
1045 |
+
if i < 3:
|
1046 |
+
hf_downsample_prefix = f'down_blocks.{i}.downsamplers.0.'
|
1047 |
+
sd_downsample_prefix = f'down.{i}.downsample.'
|
1048 |
+
vae_conversion_map.append(
|
1049 |
+
(sd_downsample_prefix, hf_downsample_prefix)
|
1050 |
+
)
|
1051 |
+
|
1052 |
+
hf_upsample_prefix = f'up_blocks.{i}.upsamplers.0.'
|
1053 |
+
sd_upsample_prefix = f'up.{3-i}.upsample.'
|
1054 |
+
vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
|
1055 |
+
|
1056 |
+
# up_blocks have three resnets
|
1057 |
+
# also, up blocks in hf are numbered in reverse from sd
|
1058 |
+
for j in range(3):
|
1059 |
+
hf_up_prefix = f'decoder.up_blocks.{i}.resnets.{j}.'
|
1060 |
+
sd_up_prefix = f'decoder.up.{3-i}.block.{j}.'
|
1061 |
+
vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
|
1062 |
+
|
1063 |
+
# this part accounts for mid blocks in both the encoder and the decoder
|
1064 |
+
for i in range(2):
|
1065 |
+
hf_mid_res_prefix = f'mid_block.resnets.{i}.'
|
1066 |
+
sd_mid_res_prefix = f'mid.block_{i+1}.'
|
1067 |
+
vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
|
1068 |
+
|
1069 |
+
vae_conversion_map_attn = [
|
1070 |
+
# (stable-diffusion, HF Diffusers)
|
1071 |
+
('norm.', 'group_norm.'),
|
1072 |
+
('q.', 'query.'),
|
1073 |
+
('k.', 'key.'),
|
1074 |
+
('v.', 'value.'),
|
1075 |
+
('proj_out.', 'proj_attn.'),
|
1076 |
+
]
|
1077 |
+
|
1078 |
+
mapping = {k: k for k in vae_state_dict.keys()}
|
1079 |
+
for k, v in mapping.items():
|
1080 |
+
for sd_part, hf_part in vae_conversion_map:
|
1081 |
+
v = v.replace(hf_part, sd_part)
|
1082 |
+
mapping[k] = v
|
1083 |
+
for k, v in mapping.items():
|
1084 |
+
if 'attentions' in k:
|
1085 |
+
for sd_part, hf_part in vae_conversion_map_attn:
|
1086 |
+
v = v.replace(hf_part, sd_part)
|
1087 |
+
mapping[k] = v
|
1088 |
+
new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
|
1089 |
+
weights_to_convert = ['q', 'k', 'v', 'proj_out']
|
1090 |
+
for k, v in new_state_dict.items():
|
1091 |
+
for weight_name in weights_to_convert:
|
1092 |
+
if f'mid.attn_1.{weight_name}.weight' in k:
|
1093 |
+
# print(f"Reshaping {k} for SD format")
|
1094 |
+
new_state_dict[k] = reshape_weight_for_sd(v)
|
1095 |
+
|
1096 |
+
return new_state_dict
|
1097 |
+
|
1098 |
+
|
1099 |
+
# endregion
|
1100 |
+
|
1101 |
+
# region 自作のモデル読み書きなど
|
1102 |
+
|
1103 |
+
|
1104 |
+
def is_safetensors(path):
|
1105 |
+
return os.path.splitext(path)[1].lower() == '.safetensors'
|
1106 |
+
|
1107 |
+
|
1108 |
+
def load_checkpoint_with_text_encoder_conversion(ckpt_path):
|
1109 |
+
# text encoderの格納形式が違うモデルに対応する ('text_model'がない)
|
1110 |
+
TEXT_ENCODER_KEY_REPLACEMENTS = [
|
1111 |
+
(
|
1112 |
+
'cond_stage_model.transformer.embeddings.',
|
1113 |
+
'cond_stage_model.transformer.text_model.embeddings.',
|
1114 |
+
),
|
1115 |
+
(
|
1116 |
+
'cond_stage_model.transformer.encoder.',
|
1117 |
+
'cond_stage_model.transformer.text_model.encoder.',
|
1118 |
+
),
|
1119 |
+
(
|
1120 |
+
'cond_stage_model.transformer.final_layer_norm.',
|
1121 |
+
'cond_stage_model.transformer.text_model.final_layer_norm.',
|
1122 |
+
),
|
1123 |
+
]
|
1124 |
+
|
1125 |
+
if is_safetensors(ckpt_path):
|
1126 |
+
checkpoint = None
|
1127 |
+
state_dict = load_file(ckpt_path, 'cpu')
|
1128 |
+
else:
|
1129 |
+
checkpoint = torch.load(ckpt_path, map_location='cpu')
|
1130 |
+
if 'state_dict' in checkpoint:
|
1131 |
+
state_dict = checkpoint['state_dict']
|
1132 |
+
else:
|
1133 |
+
state_dict = checkpoint
|
1134 |
+
checkpoint = None
|
1135 |
+
|
1136 |
+
key_reps = []
|
1137 |
+
for rep_from, rep_to in TEXT_ENCODER_KEY_REPLACEMENTS:
|
1138 |
+
for key in state_dict.keys():
|
1139 |
+
if key.startswith(rep_from):
|
1140 |
+
new_key = rep_to + key[len(rep_from) :]
|
1141 |
+
key_reps.append((key, new_key))
|
1142 |
+
|
1143 |
+
for key, new_key in key_reps:
|
1144 |
+
state_dict[new_key] = state_dict[key]
|
1145 |
+
del state_dict[key]
|
1146 |
+
|
1147 |
+
return checkpoint, state_dict
|
1148 |
+
|
1149 |
+
|
1150 |
+
# TODO dtype指定の動作が怪しいので確認する text_encoderを指定形式で作れるか未確認
|
1151 |
+
def load_models_from_stable_diffusion_checkpoint(v2, ckpt_path, dtype=None):
|
1152 |
+
_, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path)
|
1153 |
+
if dtype is not None:
|
1154 |
+
for k, v in state_dict.items():
|
1155 |
+
if type(v) is torch.Tensor:
|
1156 |
+
state_dict[k] = v.to(dtype)
|
1157 |
+
|
1158 |
+
# Convert the UNet2DConditionModel model.
|
1159 |
+
unet_config = create_unet_diffusers_config(v2)
|
1160 |
+
converted_unet_checkpoint = convert_ldm_unet_checkpoint(
|
1161 |
+
v2, state_dict, unet_config
|
1162 |
+
)
|
1163 |
+
|
1164 |
+
unet = UNet2DConditionModel(**unet_config)
|
1165 |
+
info = unet.load_state_dict(converted_unet_checkpoint)
|
1166 |
+
print('loading u-net:', info)
|
1167 |
+
|
1168 |
+
# Convert the VAE model.
|
1169 |
+
vae_config = create_vae_diffusers_config()
|
1170 |
+
converted_vae_checkpoint = convert_ldm_vae_checkpoint(
|
1171 |
+
state_dict, vae_config
|
1172 |
+
)
|
1173 |
+
|
1174 |
+
vae = AutoencoderKL(**vae_config)
|
1175 |
+
info = vae.load_state_dict(converted_vae_checkpoint)
|
1176 |
+
print('loadint vae:', info)
|
1177 |
+
|
1178 |
+
# convert text_model
|
1179 |
+
if v2:
|
1180 |
+
converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v2(
|
1181 |
+
state_dict, 77
|
1182 |
+
)
|
1183 |
+
cfg = CLIPTextConfig(
|
1184 |
+
vocab_size=49408,
|
1185 |
+
hidden_size=1024,
|
1186 |
+
intermediate_size=4096,
|
1187 |
+
num_hidden_layers=23,
|
1188 |
+
num_attention_heads=16,
|
1189 |
+
max_position_embeddings=77,
|
1190 |
+
hidden_act='gelu',
|
1191 |
+
layer_norm_eps=1e-05,
|
1192 |
+
dropout=0.0,
|
1193 |
+
attention_dropout=0.0,
|
1194 |
+
initializer_range=0.02,
|
1195 |
+
initializer_factor=1.0,
|
1196 |
+
pad_token_id=1,
|
1197 |
+
bos_token_id=0,
|
1198 |
+
eos_token_id=2,
|
1199 |
+
model_type='clip_text_model',
|
1200 |
+
projection_dim=512,
|
1201 |
+
torch_dtype='float32',
|
1202 |
+
transformers_version='4.25.0.dev0',
|
1203 |
+
)
|
1204 |
+
text_model = CLIPTextModel._from_config(cfg)
|
1205 |
+
info = text_model.load_state_dict(converted_text_encoder_checkpoint)
|
1206 |
+
else:
|
1207 |
+
converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v1(
|
1208 |
+
state_dict
|
1209 |
+
)
|
1210 |
+
text_model = CLIPTextModel.from_pretrained(
|
1211 |
+
'openai/clip-vit-large-patch14'
|
1212 |
+
)
|
1213 |
+
info = text_model.load_state_dict(converted_text_encoder_checkpoint)
|
1214 |
+
print('loading text encoder:', info)
|
1215 |
+
|
1216 |
+
return text_model, vae, unet
|
1217 |
+
|
1218 |
+
|
1219 |
+
def convert_text_encoder_state_dict_to_sd_v2(
|
1220 |
+
checkpoint, make_dummy_weights=False
|
1221 |
+
):
|
1222 |
+
def convert_key(key):
|
1223 |
+
# position_idsの除去
|
1224 |
+
if '.position_ids' in key:
|
1225 |
+
return None
|
1226 |
+
|
1227 |
+
# common
|
1228 |
+
key = key.replace('text_model.encoder.', 'transformer.')
|
1229 |
+
key = key.replace('text_model.', '')
|
1230 |
+
if 'layers' in key:
|
1231 |
+
# resblocks conversion
|
1232 |
+
key = key.replace('.layers.', '.resblocks.')
|
1233 |
+
if '.layer_norm' in key:
|
1234 |
+
key = key.replace('.layer_norm', '.ln_')
|
1235 |
+
elif '.mlp.' in key:
|
1236 |
+
key = key.replace('.fc1.', '.c_fc.')
|
1237 |
+
key = key.replace('.fc2.', '.c_proj.')
|
1238 |
+
elif '.self_attn.out_proj' in key:
|
1239 |
+
key = key.replace('.self_attn.out_proj.', '.attn.out_proj.')
|
1240 |
+
elif '.self_attn.' in key:
|
1241 |
+
key = None # 特殊なので後で処理する
|
1242 |
+
else:
|
1243 |
+
raise ValueError(f'unexpected key in DiffUsers model: {key}')
|
1244 |
+
elif '.position_embedding' in key:
|
1245 |
+
key = key.replace(
|
1246 |
+
'embeddings.position_embedding.weight', 'positional_embedding'
|
1247 |
+
)
|
1248 |
+
elif '.token_embedding' in key:
|
1249 |
+
key = key.replace(
|
1250 |
+
'embeddings.token_embedding.weight', 'token_embedding.weight'
|
1251 |
+
)
|
1252 |
+
elif 'final_layer_norm' in key:
|
1253 |
+
key = key.replace('final_layer_norm', 'ln_final')
|
1254 |
+
return key
|
1255 |
+
|
1256 |
+
keys = list(checkpoint.keys())
|
1257 |
+
new_sd = {}
|
1258 |
+
for key in keys:
|
1259 |
+
new_key = convert_key(key)
|
1260 |
+
if new_key is None:
|
1261 |
+
continue
|
1262 |
+
new_sd[new_key] = checkpoint[key]
|
1263 |
+
|
1264 |
+
# attnの変換
|
1265 |
+
for key in keys:
|
1266 |
+
if 'layers' in key and 'q_proj' in key:
|
1267 |
+
# 三つを結合
|
1268 |
+
key_q = key
|
1269 |
+
key_k = key.replace('q_proj', 'k_proj')
|
1270 |
+
key_v = key.replace('q_proj', 'v_proj')
|
1271 |
+
|
1272 |
+
value_q = checkpoint[key_q]
|
1273 |
+
value_k = checkpoint[key_k]
|
1274 |
+
value_v = checkpoint[key_v]
|
1275 |
+
value = torch.cat([value_q, value_k, value_v])
|
1276 |
+
|
1277 |
+
new_key = key.replace(
|
1278 |
+
'text_model.encoder.layers.', 'transformer.resblocks.'
|
1279 |
+
)
|
1280 |
+
new_key = new_key.replace('.self_attn.q_proj.', '.attn.in_proj_')
|
1281 |
+
new_sd[new_key] = value
|
1282 |
+
|
1283 |
+
# 最後の層などを捏造するか
|
1284 |
+
if make_dummy_weights:
|
1285 |
+
print(
|
1286 |
+
'make dummy weights for resblock.23, text_projection and logit scale.'
|
1287 |
+
)
|
1288 |
+
keys = list(new_sd.keys())
|
1289 |
+
for key in keys:
|
1290 |
+
if key.startswith('transformer.resblocks.22.'):
|
1291 |
+
new_sd[key.replace('.22.', '.23.')] = new_sd[
|
1292 |
+
key
|
1293 |
+
].clone() # copyしないとsafetensorsの保存で落ちる
|
1294 |
+
|
1295 |
+
# Diffusersに含まれない重みを作っておく
|
1296 |
+
new_sd['text_projection'] = torch.ones(
|
1297 |
+
(1024, 1024),
|
1298 |
+
dtype=new_sd[keys[0]].dtype,
|
1299 |
+
device=new_sd[keys[0]].device,
|
1300 |
+
)
|
1301 |
+
new_sd['logit_scale'] = torch.tensor(1)
|
1302 |
+
|
1303 |
+
return new_sd
|
1304 |
+
|
1305 |
+
|
1306 |
+
def save_stable_diffusion_checkpoint(
|
1307 |
+
v2,
|
1308 |
+
output_file,
|
1309 |
+
text_encoder,
|
1310 |
+
unet,
|
1311 |
+
ckpt_path,
|
1312 |
+
epochs,
|
1313 |
+
steps,
|
1314 |
+
save_dtype=None,
|
1315 |
+
vae=None,
|
1316 |
+
):
|
1317 |
+
if ckpt_path is not None:
|
1318 |
+
# epoch/stepを参照する。またVAEがメモリ上にないときなど、もう一度VAEを含めて読み込む
|
1319 |
+
checkpoint, state_dict = load_checkpoint_with_text_encoder_conversion(
|
1320 |
+
ckpt_path
|
1321 |
+
)
|
1322 |
+
if checkpoint is None: # safetensors または state_dictのckpt
|
1323 |
+
checkpoint = {}
|
1324 |
+
strict = False
|
1325 |
+
else:
|
1326 |
+
strict = True
|
1327 |
+
if 'state_dict' in state_dict:
|
1328 |
+
del state_dict['state_dict']
|
1329 |
+
else:
|
1330 |
+
# 新しく作る
|
1331 |
+
assert (
|
1332 |
+
vae is not None
|
1333 |
+
), 'VAE is required to save a checkpoint without a given checkpoint'
|
1334 |
+
checkpoint = {}
|
1335 |
+
state_dict = {}
|
1336 |
+
strict = False
|
1337 |
+
|
1338 |
+
def update_sd(prefix, sd):
|
1339 |
+
for k, v in sd.items():
|
1340 |
+
key = prefix + k
|
1341 |
+
assert (
|
1342 |
+
not strict or key in state_dict
|
1343 |
+
), f'Illegal key in save SD: {key}'
|
1344 |
+
if save_dtype is not None:
|
1345 |
+
v = v.detach().clone().to('cpu').to(save_dtype)
|
1346 |
+
state_dict[key] = v
|
1347 |
+
|
1348 |
+
# Convert the UNet model
|
1349 |
+
unet_state_dict = convert_unet_state_dict_to_sd(v2, unet.state_dict())
|
1350 |
+
update_sd('model.diffusion_model.', unet_state_dict)
|
1351 |
+
|
1352 |
+
# Convert the text encoder model
|
1353 |
+
if v2:
|
1354 |
+
make_dummy = (
|
1355 |
+
ckpt_path is None
|
1356 |
+
) # 参照元のcheckpointがない場合は最後の層を前の層から複製して作るなどダミーの重みを入れる
|
1357 |
+
text_enc_dict = convert_text_encoder_state_dict_to_sd_v2(
|
1358 |
+
text_encoder.state_dict(), make_dummy
|
1359 |
+
)
|
1360 |
+
update_sd('cond_stage_model.model.', text_enc_dict)
|
1361 |
+
else:
|
1362 |
+
text_enc_dict = text_encoder.state_dict()
|
1363 |
+
update_sd('cond_stage_model.transformer.', text_enc_dict)
|
1364 |
+
|
1365 |
+
# Convert the VAE
|
1366 |
+
if vae is not None:
|
1367 |
+
vae_dict = convert_vae_state_dict(vae.state_dict())
|
1368 |
+
update_sd('first_stage_model.', vae_dict)
|
1369 |
+
|
1370 |
+
# Put together new checkpoint
|
1371 |
+
key_count = len(state_dict.keys())
|
1372 |
+
new_ckpt = {'state_dict': state_dict}
|
1373 |
+
|
1374 |
+
if 'epoch' in checkpoint:
|
1375 |
+
epochs += checkpoint['epoch']
|
1376 |
+
if 'global_step' in checkpoint:
|
1377 |
+
steps += checkpoint['global_step']
|
1378 |
+
|
1379 |
+
new_ckpt['epoch'] = epochs
|
1380 |
+
new_ckpt['global_step'] = steps
|
1381 |
+
|
1382 |
+
if is_safetensors(output_file):
|
1383 |
+
# TODO Tensor以外のdictの値を削除したほうがいいか
|
1384 |
+
save_file(state_dict, output_file)
|
1385 |
+
else:
|
1386 |
+
torch.save(new_ckpt, output_file)
|
1387 |
+
|
1388 |
+
return key_count
|
1389 |
+
|
1390 |
+
|
1391 |
+
def save_diffusers_checkpoint(
|
1392 |
+
v2,
|
1393 |
+
output_dir,
|
1394 |
+
text_encoder,
|
1395 |
+
unet,
|
1396 |
+
pretrained_model_name_or_path,
|
1397 |
+
vae=None,
|
1398 |
+
use_safetensors=False,
|
1399 |
+
):
|
1400 |
+
if pretrained_model_name_or_path is None:
|
1401 |
+
# load default settings for v1/v2
|
1402 |
+
if v2:
|
1403 |
+
pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V2
|
1404 |
+
else:
|
1405 |
+
pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V1
|
1406 |
+
|
1407 |
+
scheduler = DDIMScheduler.from_pretrained(
|
1408 |
+
pretrained_model_name_or_path, subfolder='scheduler'
|
1409 |
+
)
|
1410 |
+
tokenizer = CLIPTokenizer.from_pretrained(
|
1411 |
+
pretrained_model_name_or_path, subfolder='tokenizer'
|
1412 |
+
)
|
1413 |
+
if vae is None:
|
1414 |
+
vae = AutoencoderKL.from_pretrained(
|
1415 |
+
pretrained_model_name_or_path, subfolder='vae'
|
1416 |
+
)
|
1417 |
+
|
1418 |
+
pipeline = StableDiffusionPipeline(
|
1419 |
+
unet=unet,
|
1420 |
+
text_encoder=text_encoder,
|
1421 |
+
vae=vae,
|
1422 |
+
scheduler=scheduler,
|
1423 |
+
tokenizer=tokenizer,
|
1424 |
+
safety_checker=None,
|
1425 |
+
feature_extractor=None,
|
1426 |
+
requires_safety_checker=None,
|
1427 |
+
)
|
1428 |
+
pipeline.save_pretrained(output_dir, safe_serialization=use_safetensors)
|
1429 |
+
|
1430 |
+
|
1431 |
+
VAE_PREFIX = 'first_stage_model.'
|
1432 |
+
|
1433 |
+
|
1434 |
+
def load_vae(vae_id, dtype):
|
1435 |
+
print(f'load VAE: {vae_id}')
|
1436 |
+
if os.path.isdir(vae_id) or not os.path.isfile(vae_id):
|
1437 |
+
# Diffusers local/remote
|
1438 |
+
try:
|
1439 |
+
vae = AutoencoderKL.from_pretrained(
|
1440 |
+
vae_id, subfolder=None, torch_dtype=dtype
|
1441 |
+
)
|
1442 |
+
except EnvironmentError as e:
|
1443 |
+
print(f'exception occurs in loading vae: {e}')
|
1444 |
+
print("retry with subfolder='vae'")
|
1445 |
+
vae = AutoencoderKL.from_pretrained(
|
1446 |
+
vae_id, subfolder='vae', torch_dtype=dtype
|
1447 |
+
)
|
1448 |
+
return vae
|
1449 |
+
|
1450 |
+
# local
|
1451 |
+
vae_config = create_vae_diffusers_config()
|
1452 |
+
|
1453 |
+
if vae_id.endswith('.bin'):
|
1454 |
+
# SD 1.5 VAE on Huggingface
|
1455 |
+
vae_sd = torch.load(vae_id, map_location='cpu')
|
1456 |
+
converted_vae_checkpoint = vae_sd
|
1457 |
+
else:
|
1458 |
+
# StableDiffusion
|
1459 |
+
vae_model = torch.load(vae_id, map_location='cpu')
|
1460 |
+
vae_sd = vae_model['state_dict']
|
1461 |
+
|
1462 |
+
# vae only or full model
|
1463 |
+
full_model = False
|
1464 |
+
for vae_key in vae_sd:
|
1465 |
+
if vae_key.startswith(VAE_PREFIX):
|
1466 |
+
full_model = True
|
1467 |
+
break
|
1468 |
+
if not full_model:
|
1469 |
+
sd = {}
|
1470 |
+
for key, value in vae_sd.items():
|
1471 |
+
sd[VAE_PREFIX + key] = value
|
1472 |
+
vae_sd = sd
|
1473 |
+
del sd
|
1474 |
+
|
1475 |
+
# Convert the VAE model.
|
1476 |
+
converted_vae_checkpoint = convert_ldm_vae_checkpoint(
|
1477 |
+
vae_sd, vae_config
|
1478 |
+
)
|
1479 |
+
|
1480 |
+
vae = AutoencoderKL(**vae_config)
|
1481 |
+
vae.load_state_dict(converted_vae_checkpoint)
|
1482 |
+
return vae
|
1483 |
+
|
1484 |
+
|
1485 |
+
def get_epoch_ckpt_name(use_safetensors, epoch):
|
1486 |
+
return f'epoch-{epoch:06d}' + (
|
1487 |
+
'.safetensors' if use_safetensors else '.ckpt'
|
1488 |
+
)
|
1489 |
+
|
1490 |
+
|
1491 |
+
def get_last_ckpt_name(use_safetensors):
|
1492 |
+
return f'last' + ('.safetensors' if use_safetensors else '.ckpt')
|
1493 |
+
|
1494 |
+
|
1495 |
+
# endregion
|
1496 |
+
|
1497 |
+
|
1498 |
+
def make_bucket_resolutions(
|
1499 |
+
max_reso, min_size=256, max_size=1024, divisible=64
|
1500 |
+
):
|
1501 |
+
max_width, max_height = max_reso
|
1502 |
+
max_area = (max_width // divisible) * (max_height // divisible)
|
1503 |
+
|
1504 |
+
resos = set()
|
1505 |
+
|
1506 |
+
size = int(math.sqrt(max_area)) * divisible
|
1507 |
+
resos.add((size, size))
|
1508 |
+
|
1509 |
+
size = min_size
|
1510 |
+
while size <= max_size:
|
1511 |
+
width = size
|
1512 |
+
height = min(max_size, (max_area // (width // divisible)) * divisible)
|
1513 |
+
resos.add((width, height))
|
1514 |
+
resos.add((height, width))
|
1515 |
+
|
1516 |
+
# # make additional resos
|
1517 |
+
# if width >= height and width - divisible >= min_size:
|
1518 |
+
# resos.add((width - divisible, height))
|
1519 |
+
# resos.add((height, width - divisible))
|
1520 |
+
# if height >= width and height - divisible >= min_size:
|
1521 |
+
# resos.add((width, height - divisible))
|
1522 |
+
# resos.add((height - divisible, width))
|
1523 |
+
|
1524 |
+
size += divisible
|
1525 |
+
|
1526 |
+
resos = list(resos)
|
1527 |
+
resos.sort()
|
1528 |
+
|
1529 |
+
aspect_ratios = [w / h for w, h in resos]
|
1530 |
+
return resos, aspect_ratios
|
1531 |
+
|
1532 |
+
|
1533 |
+
if __name__ == '__main__':
|
1534 |
+
resos, aspect_ratios = make_bucket_resolutions((512, 768))
|
1535 |
+
print(len(resos))
|
1536 |
+
print(resos)
|
1537 |
+
print(aspect_ratios)
|
1538 |
+
|
1539 |
+
ars = set()
|
1540 |
+
for ar in aspect_ratios:
|
1541 |
+
if ar in ars:
|
1542 |
+
print('error! duplicate ar:', ar)
|
1543 |
+
ars.add(ar)
|
StableTuner_RunPod_Fix/trainer.py
ADDED
@@ -0,0 +1,1750 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Copyright 2022 HuggingFace, ShivamShrirao
|
3 |
+
|
4 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
you may not use this file except in compliance with the License.
|
6 |
+
You may obtain a copy of the License at
|
7 |
+
|
8 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
|
10 |
+
Unless required by applicable law or agreed to in writing, software
|
11 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
See the License for the specific language governing permissions and
|
14 |
+
limitations under the License.
|
15 |
+
"""
|
16 |
+
import keyboard
|
17 |
+
import gradio as gr
|
18 |
+
import argparse
|
19 |
+
import random
|
20 |
+
import hashlib
|
21 |
+
import itertools
|
22 |
+
import json
|
23 |
+
import math
|
24 |
+
import os
|
25 |
+
import copy
|
26 |
+
from contextlib import nullcontext
|
27 |
+
from pathlib import Path
|
28 |
+
import shutil
|
29 |
+
import torch
|
30 |
+
import torch.nn.functional as F
|
31 |
+
import torch.utils.checkpoint
|
32 |
+
import numpy as np
|
33 |
+
from accelerate import Accelerator
|
34 |
+
from accelerate.logging import get_logger
|
35 |
+
from accelerate.utils import set_seed
|
36 |
+
from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel,DiffusionPipeline, DPMSolverMultistepScheduler,EulerDiscreteScheduler
|
37 |
+
from diffusers.optimization import get_scheduler
|
38 |
+
from torchvision.transforms import functional
|
39 |
+
from tqdm.auto import tqdm
|
40 |
+
from transformers import CLIPTextModel, CLIPTokenizer
|
41 |
+
from typing import Dict, List, Generator, Tuple
|
42 |
+
from PIL import Image, ImageFile
|
43 |
+
from diffusers.utils.import_utils import is_xformers_available
|
44 |
+
from trainer_util import *
|
45 |
+
from dataloaders_util import *
|
46 |
+
from discriminator import Discriminator2D
|
47 |
+
from lion_pytorch import Lion
|
48 |
+
logger = get_logger(__name__)
|
49 |
+
def parse_args():
|
50 |
+
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
51 |
+
parser.add_argument(
|
52 |
+
"--revision",
|
53 |
+
type=str,
|
54 |
+
default=None,
|
55 |
+
required=False,
|
56 |
+
help="Revision of pretrained model identifier from huggingface.co/models.",
|
57 |
+
)
|
58 |
+
|
59 |
+
parser.add_argument(
|
60 |
+
"--attention",
|
61 |
+
type=str,
|
62 |
+
choices=["xformers", "flash_attention"],
|
63 |
+
default="xformers",
|
64 |
+
help="Type of attention to use."
|
65 |
+
)
|
66 |
+
parser.add_argument(
|
67 |
+
"--model_variant",
|
68 |
+
type=str,
|
69 |
+
default='base',
|
70 |
+
required=False,
|
71 |
+
help="Train Base/Inpaint/Depth2Img",
|
72 |
+
)
|
73 |
+
parser.add_argument(
|
74 |
+
"--aspect_mode",
|
75 |
+
type=str,
|
76 |
+
default='dynamic',
|
77 |
+
required=False,
|
78 |
+
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
79 |
+
)
|
80 |
+
parser.add_argument(
|
81 |
+
"--aspect_mode_action_preference",
|
82 |
+
type=str,
|
83 |
+
default='add',
|
84 |
+
required=False,
|
85 |
+
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
86 |
+
)
|
87 |
+
parser.add_argument('--use_lion',default=False,action="store_true", help='Use the new LION optimizer')
|
88 |
+
parser.add_argument('--use_ema',default=False,action="store_true", help='Use EMA for finetuning')
|
89 |
+
parser.add_argument('--clip_penultimate',default=False,action="store_true", help='Use penultimate CLIP layer for text embedding')
|
90 |
+
parser.add_argument("--conditional_dropout", type=float, default=None,required=False, help="Conditional dropout probability")
|
91 |
+
parser.add_argument('--disable_cudnn_benchmark', default=False, action="store_true")
|
92 |
+
parser.add_argument('--use_text_files_as_captions', default=False, action="store_true")
|
93 |
+
|
94 |
+
parser.add_argument(
|
95 |
+
"--sample_from_batch",
|
96 |
+
type=int,
|
97 |
+
default=0,
|
98 |
+
help=("Number of prompts to sample from the batch for inference"),
|
99 |
+
)
|
100 |
+
parser.add_argument(
|
101 |
+
"--flatten_sample_folder",
|
102 |
+
default=True,
|
103 |
+
action="store_true",
|
104 |
+
help="Will save samples in one folder instead of per-epoch",
|
105 |
+
)
|
106 |
+
parser.add_argument(
|
107 |
+
"--stop_text_encoder_training",
|
108 |
+
type=int,
|
109 |
+
default=999999999999999,
|
110 |
+
help=("The epoch at which the text_encoder is no longer trained"),
|
111 |
+
)
|
112 |
+
parser.add_argument(
|
113 |
+
"--use_bucketing",
|
114 |
+
default=False,
|
115 |
+
action="store_true",
|
116 |
+
help="Will save and generate samples before training",
|
117 |
+
)
|
118 |
+
parser.add_argument(
|
119 |
+
"--regenerate_latent_cache",
|
120 |
+
default=False,
|
121 |
+
action="store_true",
|
122 |
+
help="Will save and generate samples before training",
|
123 |
+
)
|
124 |
+
parser.add_argument(
|
125 |
+
"--sample_on_training_start",
|
126 |
+
default=False,
|
127 |
+
action="store_true",
|
128 |
+
help="Will save and generate samples before training",
|
129 |
+
)
|
130 |
+
|
131 |
+
parser.add_argument(
|
132 |
+
"--add_class_images_to_dataset",
|
133 |
+
default=False,
|
134 |
+
action="store_true",
|
135 |
+
help="will generate and add class images to the dataset without using prior reservation in training",
|
136 |
+
)
|
137 |
+
parser.add_argument(
|
138 |
+
"--auto_balance_concept_datasets",
|
139 |
+
default=False,
|
140 |
+
action="store_true",
|
141 |
+
help="will balance the number of images in each concept dataset to match the minimum number of images in any concept dataset",
|
142 |
+
)
|
143 |
+
parser.add_argument(
|
144 |
+
"--sample_aspect_ratios",
|
145 |
+
default=False,
|
146 |
+
action="store_true",
|
147 |
+
help="sample different aspect ratios for each image",
|
148 |
+
)
|
149 |
+
parser.add_argument(
|
150 |
+
"--dataset_repeats",
|
151 |
+
type=int,
|
152 |
+
default=1,
|
153 |
+
help="repeat the dataset this many times",
|
154 |
+
)
|
155 |
+
parser.add_argument(
|
156 |
+
"--save_every_n_epoch",
|
157 |
+
type=int,
|
158 |
+
default=1,
|
159 |
+
help="save on epoch finished",
|
160 |
+
)
|
161 |
+
parser.add_argument(
|
162 |
+
"--pretrained_model_name_or_path",
|
163 |
+
type=str,
|
164 |
+
default=None,
|
165 |
+
required=True,
|
166 |
+
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
167 |
+
)
|
168 |
+
parser.add_argument(
|
169 |
+
"--pretrained_vae_name_or_path",
|
170 |
+
type=str,
|
171 |
+
default=None,
|
172 |
+
help="Path to pretrained vae or vae identifier from huggingface.co/models.",
|
173 |
+
)
|
174 |
+
parser.add_argument(
|
175 |
+
"--tokenizer_name",
|
176 |
+
type=str,
|
177 |
+
default=None,
|
178 |
+
help="Pretrained tokenizer name or path if not the same as model_name",
|
179 |
+
)
|
180 |
+
parser.add_argument(
|
181 |
+
"--instance_data_dir",
|
182 |
+
type=str,
|
183 |
+
default=None,
|
184 |
+
help="A folder containing the training data of instance images.",
|
185 |
+
)
|
186 |
+
parser.add_argument(
|
187 |
+
"--class_data_dir",
|
188 |
+
type=str,
|
189 |
+
default=None,
|
190 |
+
help="A folder containing the training data of class images.",
|
191 |
+
)
|
192 |
+
parser.add_argument(
|
193 |
+
"--instance_prompt",
|
194 |
+
type=str,
|
195 |
+
default=None,
|
196 |
+
help="The prompt with identifier specifying the instance",
|
197 |
+
)
|
198 |
+
parser.add_argument(
|
199 |
+
"--class_prompt",
|
200 |
+
type=str,
|
201 |
+
default=None,
|
202 |
+
help="The prompt to specify images in the same class as provided instance images.",
|
203 |
+
)
|
204 |
+
parser.add_argument(
|
205 |
+
"--save_sample_prompt",
|
206 |
+
type=str,
|
207 |
+
default=None,
|
208 |
+
help="The prompt used to generate sample outputs to save.",
|
209 |
+
)
|
210 |
+
parser.add_argument(
|
211 |
+
"--n_save_sample",
|
212 |
+
type=int,
|
213 |
+
default=4,
|
214 |
+
help="The number of samples to save.",
|
215 |
+
)
|
216 |
+
parser.add_argument(
|
217 |
+
"--sample_height",
|
218 |
+
type=int,
|
219 |
+
default=512,
|
220 |
+
help="The number of samples to save.",
|
221 |
+
)
|
222 |
+
parser.add_argument(
|
223 |
+
"--sample_width",
|
224 |
+
type=int,
|
225 |
+
default=512,
|
226 |
+
help="The number of samples to save.",
|
227 |
+
)
|
228 |
+
parser.add_argument(
|
229 |
+
"--save_guidance_scale",
|
230 |
+
type=float,
|
231 |
+
default=7.5,
|
232 |
+
help="CFG for save sample.",
|
233 |
+
)
|
234 |
+
parser.add_argument(
|
235 |
+
"--save_infer_steps",
|
236 |
+
type=int,
|
237 |
+
default=30,
|
238 |
+
help="The number of inference steps for save sample.",
|
239 |
+
)
|
240 |
+
parser.add_argument(
|
241 |
+
"--with_prior_preservation",
|
242 |
+
default=False,
|
243 |
+
action="store_true",
|
244 |
+
help="Flag to add prior preservation loss.",
|
245 |
+
)
|
246 |
+
parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
|
247 |
+
parser.add_argument(
|
248 |
+
"--with_offset_noise",
|
249 |
+
default=False,
|
250 |
+
action="store_true",
|
251 |
+
help="Flag to offset noise applied to latents.",
|
252 |
+
)
|
253 |
+
|
254 |
+
parser.add_argument("--offset_noise_weight", type=float, default=0.1, help="The weight of offset noise applied during training.")
|
255 |
+
parser.add_argument(
|
256 |
+
"--num_class_images",
|
257 |
+
type=int,
|
258 |
+
default=100,
|
259 |
+
help=(
|
260 |
+
"Minimal class images for prior preservation loss. If not have enough images, additional images will be"
|
261 |
+
" sampled with class_prompt."
|
262 |
+
),
|
263 |
+
)
|
264 |
+
parser.add_argument(
|
265 |
+
"--output_dir",
|
266 |
+
type=str,
|
267 |
+
default="text-inversion-model",
|
268 |
+
help="The output directory where the model predictions and checkpoints will be written.",
|
269 |
+
)
|
270 |
+
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
|
271 |
+
parser.add_argument(
|
272 |
+
"--resolution",
|
273 |
+
type=int,
|
274 |
+
default=512,
|
275 |
+
help=(
|
276 |
+
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
|
277 |
+
" resolution"
|
278 |
+
),
|
279 |
+
)
|
280 |
+
parser.add_argument(
|
281 |
+
"--center_crop", default=False, action="store_true", help="Whether to center crop images before resizing to resolution"
|
282 |
+
)
|
283 |
+
parser.add_argument("--train_text_encoder", default=False, action="store_true", help="Whether to train the text encoder")
|
284 |
+
parser.add_argument(
|
285 |
+
"--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
|
286 |
+
)
|
287 |
+
parser.add_argument(
|
288 |
+
"--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
|
289 |
+
)
|
290 |
+
parser.add_argument("--num_train_epochs", type=int, default=1)
|
291 |
+
parser.add_argument(
|
292 |
+
"--max_train_steps",
|
293 |
+
type=int,
|
294 |
+
default=None,
|
295 |
+
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
|
296 |
+
)
|
297 |
+
parser.add_argument(
|
298 |
+
"--gradient_accumulation_steps",
|
299 |
+
type=int,
|
300 |
+
default=1,
|
301 |
+
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
302 |
+
)
|
303 |
+
parser.add_argument(
|
304 |
+
"--gradient_checkpointing",
|
305 |
+
default=False,
|
306 |
+
action="store_true",
|
307 |
+
help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
|
308 |
+
)
|
309 |
+
parser.add_argument(
|
310 |
+
"--learning_rate",
|
311 |
+
type=float,
|
312 |
+
default=5e-6,
|
313 |
+
help="Initial learning rate (after the potential warmup period) to use.",
|
314 |
+
)
|
315 |
+
parser.add_argument(
|
316 |
+
"--scale_lr",
|
317 |
+
action="store_true",
|
318 |
+
default=False,
|
319 |
+
help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
|
320 |
+
)
|
321 |
+
parser.add_argument(
|
322 |
+
"--lr_scheduler",
|
323 |
+
type=str,
|
324 |
+
default="constant",
|
325 |
+
help=(
|
326 |
+
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
|
327 |
+
' "constant", "constant_with_warmup"]'
|
328 |
+
),
|
329 |
+
)
|
330 |
+
parser.add_argument(
|
331 |
+
"--lr_warmup_steps", type=float, default=500, help="Number of steps for the warmup in the lr scheduler."
|
332 |
+
)
|
333 |
+
parser.add_argument(
|
334 |
+
"--use_8bit_adam", default=False, action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
|
335 |
+
)
|
336 |
+
parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
|
337 |
+
parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
|
338 |
+
parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
339 |
+
parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
|
340 |
+
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
341 |
+
parser.add_argument("--push_to_hub", default=False, action="store_true", help="Whether or not to push the model to the Hub.")
|
342 |
+
parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
|
343 |
+
parser.add_argument(
|
344 |
+
"--hub_model_id",
|
345 |
+
type=str,
|
346 |
+
default=None,
|
347 |
+
help="The name of the repository to keep in sync with the local `output_dir`.",
|
348 |
+
)
|
349 |
+
parser.add_argument(
|
350 |
+
"--logging_dir",
|
351 |
+
type=str,
|
352 |
+
default="logs",
|
353 |
+
help=(
|
354 |
+
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
|
355 |
+
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
|
356 |
+
),
|
357 |
+
)
|
358 |
+
parser.add_argument("--log_interval", type=int, default=10, help="Log every N steps.")
|
359 |
+
parser.add_argument("--sample_step_interval", type=int, default=100000000000000, help="Sample images every N steps.")
|
360 |
+
parser.add_argument(
|
361 |
+
"--mixed_precision",
|
362 |
+
type=str,
|
363 |
+
default="no",
|
364 |
+
choices=["no", "fp16", "bf16","tf32"],
|
365 |
+
help=(
|
366 |
+
"Whether to use mixed precision. Choose"
|
367 |
+
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
368 |
+
"and an Nvidia Ampere GPU."
|
369 |
+
),
|
370 |
+
)
|
371 |
+
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
372 |
+
parser.add_argument(
|
373 |
+
"--concepts_list",
|
374 |
+
type=str,
|
375 |
+
default=None,
|
376 |
+
help="Path to json containing multiple concepts, will overwrite parameters like instance_prompt, class_prompt, etc.",
|
377 |
+
)
|
378 |
+
parser.add_argument("--save_sample_controlled_seed", type=int, action='append', help="Set a seed for an extra sample image to be constantly saved.")
|
379 |
+
parser.add_argument("--detect_full_drive", default=True, action="store_true", help="Delete checkpoints when the drive is full.")
|
380 |
+
parser.add_argument("--send_telegram_updates", default=False, action="store_true", help="Send Telegram updates.")
|
381 |
+
parser.add_argument("--telegram_chat_id", type=str, default="0", help="Telegram chat ID.")
|
382 |
+
parser.add_argument("--telegram_token", type=str, default="0", help="Telegram token.")
|
383 |
+
parser.add_argument("--use_deepspeed_adam", default=False, action="store_true", help="Use experimental DeepSpeed Adam 8.")
|
384 |
+
parser.add_argument('--append_sample_controlled_seed_action', action='append')
|
385 |
+
parser.add_argument('--add_sample_prompt', type=str, action='append')
|
386 |
+
parser.add_argument('--use_image_names_as_captions', default=False, action="store_true")
|
387 |
+
parser.add_argument('--shuffle_captions', default=False, action="store_true")
|
388 |
+
parser.add_argument("--masked_training", default=False, required=False, action='store_true', help="Whether to mask parts of the image during training")
|
389 |
+
parser.add_argument("--normalize_masked_area_loss", default=False, required=False, action='store_true', help="Normalize the loss, to make it independent of the size of the masked area")
|
390 |
+
parser.add_argument("--unmasked_probability", type=float, default=1, required=False, help="Probability of training a step without a mask")
|
391 |
+
parser.add_argument("--max_denoising_strength", type=float, default=1, required=False, help="Max denoising steps to train on")
|
392 |
+
parser.add_argument('--add_mask_prompt', type=str, default=None, action="append", dest="mask_prompts", help="Prompt for automatic mask creation")
|
393 |
+
parser.add_argument('--with_gan', default=False, action="store_true", help="Use GAN (experimental)")
|
394 |
+
parser.add_argument("--gan_weight", type=float, default=0.2, required=False, help="Strength of effect GAN has on training")
|
395 |
+
parser.add_argument("--gan_warmup", type=float, default=0, required=False, help="Slowly increases GAN weight from zero over this many steps, useful when initializing a GAN discriminator from scratch")
|
396 |
+
parser.add_argument('--discriminator_config', default="configs/discriminator_large.json", help="Location of config file to use when initializing a new GAN discriminator")
|
397 |
+
parser.add_argument('--sample_from_ema', default=True, action="store_true", help="Generate sample images using the EMA model")
|
398 |
+
parser.add_argument('--run_name', type=str, default=None, help="Adds a custom identifier to the sample and checkpoint directories")
|
399 |
+
args = parser.parse_args()
|
400 |
+
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
401 |
+
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
402 |
+
args.local_rank = env_local_rank
|
403 |
+
|
404 |
+
return args
|
405 |
+
|
406 |
+
def main():
|
407 |
+
print(f" {bcolors.OKBLUE}Booting Up StableTuner{bcolors.ENDC}")
|
408 |
+
print(f" {bcolors.OKBLUE}Please wait a moment as we load up some stuff...{bcolors.ENDC}")
|
409 |
+
#torch.cuda.set_per_process_memory_fraction(0.5)
|
410 |
+
args = parse_args()
|
411 |
+
#temp arg
|
412 |
+
args.batch_tokens = None
|
413 |
+
if args.disable_cudnn_benchmark:
|
414 |
+
torch.backends.cudnn.benchmark = False
|
415 |
+
else:
|
416 |
+
torch.backends.cudnn.benchmark = True
|
417 |
+
if args.send_telegram_updates:
|
418 |
+
send_telegram_message(f"Booting up StableTuner!\n", args.telegram_chat_id, args.telegram_token)
|
419 |
+
logging_dir = Path(args.output_dir, "logs", args.logging_dir)
|
420 |
+
if args.run_name:
|
421 |
+
main_sample_dir = os.path.join(args.output_dir, f"samples_{args.run_name}")
|
422 |
+
else:
|
423 |
+
main_sample_dir = os.path.join(args.output_dir, "samples")
|
424 |
+
if os.path.exists(main_sample_dir):
|
425 |
+
shutil.rmtree(main_sample_dir)
|
426 |
+
os.makedirs(main_sample_dir)
|
427 |
+
#create logging directory
|
428 |
+
if not logging_dir.exists():
|
429 |
+
logging_dir.mkdir(parents=True)
|
430 |
+
#create output directory
|
431 |
+
if not Path(args.output_dir).exists():
|
432 |
+
Path(args.output_dir).mkdir(parents=True)
|
433 |
+
|
434 |
+
|
435 |
+
accelerator = Accelerator(
|
436 |
+
gradient_accumulation_steps=args.gradient_accumulation_steps,
|
437 |
+
mixed_precision=args.mixed_precision if args.mixed_precision != 'tf32' else 'no',
|
438 |
+
log_with="tensorboard",
|
439 |
+
logging_dir=logging_dir,
|
440 |
+
)
|
441 |
+
|
442 |
+
# Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
|
443 |
+
# This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
|
444 |
+
# TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
|
445 |
+
if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
|
446 |
+
raise ValueError(
|
447 |
+
"Gradient accumulation is not supported when training the text encoder in distributed training. "
|
448 |
+
"Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
|
449 |
+
)
|
450 |
+
|
451 |
+
if args.seed is not None:
|
452 |
+
set_seed(args.seed)
|
453 |
+
|
454 |
+
if args.concepts_list is None:
|
455 |
+
args.concepts_list = [
|
456 |
+
{
|
457 |
+
"instance_prompt": args.instance_prompt,
|
458 |
+
"class_prompt": args.class_prompt,
|
459 |
+
"instance_data_dir": args.instance_data_dir,
|
460 |
+
"class_data_dir": args.class_data_dir
|
461 |
+
}
|
462 |
+
]
|
463 |
+
else:
|
464 |
+
with open(args.concepts_list, "r") as f:
|
465 |
+
args.concepts_list = json.load(f)
|
466 |
+
|
467 |
+
if args.with_prior_preservation or args.add_class_images_to_dataset:
|
468 |
+
pipeline = None
|
469 |
+
for concept in args.concepts_list:
|
470 |
+
class_images_dir = Path(concept["class_data_dir"])
|
471 |
+
class_images_dir.mkdir(parents=True, exist_ok=True)
|
472 |
+
cur_class_images = len(list(class_images_dir.iterdir()))
|
473 |
+
|
474 |
+
if cur_class_images < args.num_class_images:
|
475 |
+
torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
|
476 |
+
if pipeline is None:
|
477 |
+
|
478 |
+
pipeline = DiffusionPipeline.from_pretrained(
|
479 |
+
args.pretrained_model_name_or_path,
|
480 |
+
safety_checker=None,
|
481 |
+
vae=AutoencoderKL.from_pretrained(args.pretrained_vae_name_or_path or args.pretrained_model_name_or_path,subfolder=None if args.pretrained_vae_name_or_path else "vae" ,safe_serialization=True),
|
482 |
+
torch_dtype=torch_dtype,
|
483 |
+
requires_safety_checker=False,
|
484 |
+
)
|
485 |
+
pipeline.set_progress_bar_config(disable=True)
|
486 |
+
pipeline.to(accelerator.device)
|
487 |
+
|
488 |
+
#if args.use_bucketing == False:
|
489 |
+
num_new_images = args.num_class_images - cur_class_images
|
490 |
+
logger.info(f"Number of class images to sample: {num_new_images}.")
|
491 |
+
|
492 |
+
sample_dataset = PromptDataset(concept["class_prompt"], num_new_images)
|
493 |
+
sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
|
494 |
+
sample_dataloader = accelerator.prepare(sample_dataloader)
|
495 |
+
#else:
|
496 |
+
#create class images that match up to the concept target buckets
|
497 |
+
# instance_images_dir = Path(concept["instance_data_dir"])
|
498 |
+
# cur_instance_images = len(list(instance_images_dir.iterdir()))
|
499 |
+
#target_wh = min(self.aspects, key=lambda aspects:abs(aspects[0]/aspects[1] - image_aspect))
|
500 |
+
# num_new_images = cur_instance_images - cur_class_images
|
501 |
+
|
502 |
+
|
503 |
+
|
504 |
+
with torch.autocast("cuda"):
|
505 |
+
for example in tqdm(
|
506 |
+
sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
|
507 |
+
):
|
508 |
+
with torch.autocast("cuda"):
|
509 |
+
images = pipeline(example["prompt"],height=args.resolution,width=args.resolution).images
|
510 |
+
for i, image in enumerate(images):
|
511 |
+
hash_image = hashlib.sha1(image.tobytes()).hexdigest()
|
512 |
+
image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
|
513 |
+
image.save(image_filename)
|
514 |
+
|
515 |
+
del pipeline
|
516 |
+
if torch.cuda.is_available():
|
517 |
+
torch.cuda.empty_cache()
|
518 |
+
torch.cuda.ipc_collect()
|
519 |
+
# Load the tokenizer
|
520 |
+
if args.tokenizer_name:
|
521 |
+
tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name )
|
522 |
+
elif args.pretrained_model_name_or_path:
|
523 |
+
#print(os.getcwd())
|
524 |
+
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer" )
|
525 |
+
|
526 |
+
# Load models and create wrapper for stable diffusion
|
527 |
+
#text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder" )
|
528 |
+
text_encoder = CLIPTextModel.from_pretrained(
|
529 |
+
args.pretrained_model_name_or_path,
|
530 |
+
subfolder="text_encoder",
|
531 |
+
revision=args.revision,
|
532 |
+
)
|
533 |
+
vae = AutoencoderKL.from_pretrained(
|
534 |
+
args.pretrained_model_name_or_path,
|
535 |
+
subfolder="vae",
|
536 |
+
revision=args.revision,
|
537 |
+
)
|
538 |
+
unet = UNet2DConditionModel.from_pretrained(
|
539 |
+
args.pretrained_model_name_or_path,
|
540 |
+
subfolder="unet",
|
541 |
+
revision=args.revision,
|
542 |
+
torch_dtype=torch.float32
|
543 |
+
)
|
544 |
+
|
545 |
+
if args.with_gan:
|
546 |
+
if os.path.isdir(os.path.join(args.pretrained_model_name_or_path, "discriminator")):
|
547 |
+
discriminator = Discriminator2D.from_pretrained(
|
548 |
+
args.pretrained_model_name_or_path,
|
549 |
+
subfolder="discriminator",
|
550 |
+
revision=args.revision,
|
551 |
+
)
|
552 |
+
else:
|
553 |
+
print(f" {bcolors.WARNING}Discriminator network (GAN) not found. Initializing a new network. It may take a very large number of steps to train.{bcolors.ENDC}")
|
554 |
+
if not args.gan_warmup:
|
555 |
+
print(f" {bcolors.WARNING}Consider using --gan_warmup to stabilize the model while the discriminator is being trained.{bcolors.ENDC}")
|
556 |
+
with open(args.discriminator_config, "r") as f:
|
557 |
+
discriminator_config = json.load(f)
|
558 |
+
discriminator = Discriminator2D.from_config(discriminator_config)
|
559 |
+
|
560 |
+
|
561 |
+
if is_xformers_available() and args.attention=='xformers':
|
562 |
+
try:
|
563 |
+
vae.enable_xformers_memory_efficient_attention()
|
564 |
+
unet.enable_xformers_memory_efficient_attention()
|
565 |
+
if args.with_gan:
|
566 |
+
discriminator.enable_xformers_memory_efficient_attention()
|
567 |
+
except Exception as e:
|
568 |
+
logger.warning(
|
569 |
+
"Could not enable memory efficient attention. Make sure xformers is installed"
|
570 |
+
f" correctly and a GPU is available: {e}"
|
571 |
+
)
|
572 |
+
elif args.attention=='flash_attention':
|
573 |
+
replace_unet_cross_attn_to_flash_attention()
|
574 |
+
|
575 |
+
if args.use_ema == True:
|
576 |
+
if os.path.isdir(os.path.join(args.pretrained_model_name_or_path, "unet_ema")):
|
577 |
+
ema_unet = UNet2DConditionModel.from_pretrained(
|
578 |
+
args.pretrained_model_name_or_path,
|
579 |
+
subfolder="unet_ema",
|
580 |
+
revision=args.revision,
|
581 |
+
torch_dtype=torch.float32
|
582 |
+
)
|
583 |
+
else:
|
584 |
+
ema_unet = copy.deepcopy(unet)
|
585 |
+
ema_unet.config["step"] = 0
|
586 |
+
for param in ema_unet.parameters():
|
587 |
+
param.requires_grad = False
|
588 |
+
|
589 |
+
if args.model_variant == "depth2img":
|
590 |
+
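# Depth2Img is passed to the datasets below as extra_module to provide depth conditioning
# (assumption: it wraps a depth estimator used only by the depth2img variant).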
d2i = Depth2Img(unet,text_encoder,args.mixed_precision,args.pretrained_model_name_or_path,accelerator)
|
591 |
+
vae.requires_grad_(False)
|
592 |
+
vae.enable_slicing()
|
593 |
+
if not args.train_text_encoder:
|
594 |
+
text_encoder.requires_grad_(False)
|
595 |
+
|
596 |
+
if args.gradient_checkpointing:
|
597 |
+
unet.enable_gradient_checkpointing()
|
598 |
+
if args.train_text_encoder:
|
599 |
+
text_encoder.gradient_checkpointing_enable()
|
600 |
+
if args.with_gan:
|
601 |
+
discriminator.enable_gradient_checkpointing()
|
602 |
+
|
603 |
+
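# Optionally scale the learning rate linearly with the effective batch size
# (train_batch_size * gradient_accumulation_steps * num_processes).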
if args.scale_lr:
|
604 |
+
args.learning_rate = (
|
605 |
+
args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
|
606 |
+
)
|
607 |
+
|
608 |
+
# Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
|
609 |
+
if args.use_8bit_adam and args.use_deepspeed_adam==False and args.use_lion==False:
|
610 |
+
try:
|
611 |
+
import bitsandbytes as bnb
|
612 |
+
except ImportError:
|
613 |
+
raise ImportError(
|
614 |
+
"To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
|
615 |
+
)
|
616 |
+
optimizer_class = bnb.optim.AdamW8bit
|
617 |
+
print("Using 8-bit Adam")
|
618 |
+
elif args.use_8bit_adam and args.use_deepspeed_adam==True:
|
619 |
+
try:
|
620 |
+
from deepspeed.ops.adam import DeepSpeedCPUAdam
|
621 |
+
except ImportError:
|
622 |
+
raise ImportError(
|
623 |
+
"To use 8-bit DeepSpeed Adam, try updating your cuda and deepspeed integrations."
|
624 |
+
)
|
625 |
+
optimizer_class = DeepSpeedCPUAdam
|
626 |
+
elif args.use_lion == True:
|
627 |
+
print("Using LION optimizer")
|
628 |
+
optimizer_class = Lion
|
629 |
+
elif args.use_deepspeed_adam==False and args.use_lion==False and args.use_8bit_adam==False:
|
630 |
+
optimizer_class = torch.optim.AdamW
|
631 |
+
params_to_optimize = (
|
632 |
+
itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
|
633 |
+
)
|
634 |
+
if args.use_lion == False:
|
635 |
+
optimizer = optimizer_class(
|
636 |
+
params_to_optimize,
|
637 |
+
lr=args.learning_rate,
|
638 |
+
betas=(args.adam_beta1, args.adam_beta2),
|
639 |
+
weight_decay=args.adam_weight_decay,
|
640 |
+
eps=args.adam_epsilon,
|
641 |
+
)
|
642 |
+
if args.with_gan:
|
643 |
+
optimizer_discriminator = optimizer_class(
|
644 |
+
discriminator.parameters(),
|
645 |
+
lr=args.learning_rate,
|
646 |
+
betas=(args.adam_beta1, args.adam_beta2),
|
647 |
+
weight_decay=args.adam_weight_decay,
|
648 |
+
eps=args.adam_epsilon,
|
649 |
+
)
|
650 |
+
else:
|
651 |
+
optimizer = optimizer_class(
|
652 |
+
params_to_optimize,
|
653 |
+
lr=args.learning_rate,
|
654 |
+
betas=(args.adam_beta1, args.adam_beta2),
|
655 |
+
weight_decay=args.adam_weight_decay,
|
656 |
+
#eps=args.adam_epsilon,
|
657 |
+
)
|
658 |
+
if args.with_gan:
|
659 |
+
optimizer_discriminator = optimizer_class(
|
660 |
+
discriminator.parameters(),
|
661 |
+
lr=args.learning_rate,
|
662 |
+
betas=(args.adam_beta1, args.adam_beta2),
|
663 |
+
weight_decay=args.adam_weight_decay,
|
664 |
+
#eps=args.adam_epsilon,
|
665 |
+
)
|
666 |
+
noise_scheduler = DDPMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler")
|
667 |
+
|
668 |
+
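# AutoBucketing groups images into aspect-ratio buckets at the target resolution;
# the fallback NormalDataset resizes (and optionally center-crops) everything to one square size.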
if args.use_bucketing:
|
669 |
+
train_dataset = AutoBucketing(
|
670 |
+
concepts_list=args.concepts_list,
|
671 |
+
use_image_names_as_captions=args.use_image_names_as_captions,
|
672 |
+
shuffle_captions=args.shuffle_captions,
|
673 |
+
batch_size=args.train_batch_size,
|
674 |
+
tokenizer=tokenizer,
|
675 |
+
add_class_images_to_dataset=args.add_class_images_to_dataset,
|
676 |
+
balance_datasets=args.auto_balance_concept_datasets,
|
677 |
+
resolution=args.resolution,
|
678 |
+
with_prior_loss=False,#args.with_prior_preservation,
|
679 |
+
repeats=args.dataset_repeats,
|
680 |
+
use_text_files_as_captions=args.use_text_files_as_captions,
|
681 |
+
aspect_mode=args.aspect_mode,
|
682 |
+
action_preference=args.aspect_mode_action_preference,
|
683 |
+
seed=args.seed,
|
684 |
+
model_variant=args.model_variant,
|
685 |
+
extra_module=None if args.model_variant != "depth2img" else d2i,
|
686 |
+
mask_prompts=args.mask_prompts,
|
687 |
+
load_mask=args.masked_training,
|
688 |
+
)
|
689 |
+
else:
|
690 |
+
train_dataset = NormalDataset(
|
691 |
+
concepts_list=args.concepts_list,
|
692 |
+
tokenizer=tokenizer,
|
693 |
+
with_prior_preservation=args.with_prior_preservation,
|
694 |
+
size=args.resolution,
|
695 |
+
center_crop=args.center_crop,
|
696 |
+
num_class_images=args.num_class_images,
|
697 |
+
use_image_names_as_captions=args.use_image_names_as_captions,
|
698 |
+
shuffle_captions=args.shuffle_captions,
|
699 |
+
repeats=args.dataset_repeats,
|
700 |
+
use_text_files_as_captions=args.use_text_files_as_captions,
|
701 |
+
seed = args.seed,
|
702 |
+
model_variant=args.model_variant,
|
703 |
+
extra_module=None if args.model_variant != "depth2img" else d2i,
|
704 |
+
mask_prompts=args.mask_prompts,
|
705 |
+
load_mask=args.masked_training,
|
706 |
+
)
|
707 |
+
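# collate_fn stacks the image tensors, pads the token ids to the tokenizer's max length,
# and, with prior preservation, appends the class examples after the instance examples
# so both halves of the batch go through a single forward pass.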
def collate_fn(examples):
|
708 |
+
#print(examples)
|
709 |
+
#print('test')
|
710 |
+
input_ids = [example["instance_prompt_ids"] for example in examples]
|
711 |
+
tokens = input_ids
|
712 |
+
pixel_values = [example["instance_images"] for example in examples]
|
713 |
+
mask = None
|
714 |
+
if "mask" in examples[0]:
|
715 |
+
mask = [example["mask"] for example in examples]
|
716 |
+
if args.model_variant == 'depth2img':
|
717 |
+
depth = [example["instance_depth_images"] for example in examples]
|
718 |
+
|
719 |
+
#print('test')
|
720 |
+
# Concat class and instance examples for prior preservation.
|
721 |
+
# We do this to avoid doing two forward passes.
|
722 |
+
if args.with_prior_preservation:
|
723 |
+
input_ids += [example["class_prompt_ids"] for example in examples]
|
724 |
+
pixel_values += [example["class_images"] for example in examples]
|
725 |
+
if "mask" in examples[0]:
|
726 |
+
mask += [example["class_mask"] for example in examples]
|
727 |
+
if args.model_variant == 'depth2img':
|
728 |
+
depth = [example["class_depth_images"] for example in examples]
|
729 |
+
mask_values = None
|
730 |
+
if mask is not None:
|
731 |
+
mask_values = torch.stack(mask)
|
732 |
+
mask_values = mask_values.to(memory_format=torch.contiguous_format).float()
|
733 |
+
if args.model_variant == 'depth2img':
|
734 |
+
depth_values = torch.stack(depth)
|
735 |
+
depth_values = depth_values.to(memory_format=torch.contiguous_format).float()
|
736 |
+
### no need to do it now when it's loaded by the multiAspectsDataset
|
737 |
+
#if args.with_prior_preservation:
|
738 |
+
# input_ids += [example["class_prompt_ids"] for example in examples]
|
739 |
+
# pixel_values += [example["class_images"] for example in examples]
|
740 |
+
|
741 |
+
#print(pixel_values)
|
742 |
+
#unpack the pixel_values from tensor to list
|
743 |
+
|
744 |
+
|
745 |
+
pixel_values = torch.stack(pixel_values)
|
746 |
+
pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
|
747 |
+
input_ids = tokenizer.pad(
|
748 |
+
{"input_ids": input_ids},
|
749 |
+
padding="max_length",
|
750 |
+
max_length=tokenizer.model_max_length,
|
751 |
+
return_tensors="pt",
|
752 |
+
).input_ids
|
753 |
+
|
754 |
+
extra_values = None
|
755 |
+
if args.model_variant == 'depth2img':
|
756 |
+
extra_values = depth_values
|
757 |
+
|
758 |
+
return {
|
759 |
+
"input_ids": input_ids,
|
760 |
+
"pixel_values": pixel_values,
|
761 |
+
"extra_values": extra_values,
|
762 |
+
"mask_values": mask_values,
|
763 |
+
"tokens": tokens
|
764 |
+
}
|
765 |
+
|
766 |
+
train_dataloader = torch.utils.data.DataLoader(
|
767 |
+
train_dataset, batch_size=args.train_batch_size, shuffle=False, collate_fn=collate_fn, pin_memory=True
|
768 |
+
)
|
769 |
+
#get the length of the dataset
|
770 |
+
train_dataset_length = len(train_dataset)
|
771 |
+
#code to check if latent cache needs to be resaved
|
772 |
+
#check if last_run.json file exists in logging_dir
|
773 |
+
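# last_run.json stores the previous run's batch size and dataset length; if either changed,
# the cached latents no longer line up with the batches and must be regenerated.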
if os.path.exists(logging_dir / "last_run.json"):
|
774 |
+
#if it exists, load it
|
775 |
+
with open(logging_dir / "last_run.json", "r") as f:
|
776 |
+
last_run = json.load(f)
|
777 |
+
last_run_batch_size = last_run["batch_size"]
|
778 |
+
last_run_dataset_length = last_run["dataset_length"]
|
779 |
+
if last_run_batch_size != args.train_batch_size:
|
780 |
+
print(f" {bcolors.WARNING}The batch_size has changed since the last run. Regenerating Latent Cache.{bcolors.ENDC}")
|
781 |
+
|
782 |
+
args.regenerate_latent_cache = True
|
783 |
+
#save the new batch_size and dataset_length to last_run.json
|
784 |
+
if last_run_dataset_length != train_dataset_length:
|
785 |
+
print(f" {bcolors.WARNING}The dataset length has changed since the last run. Regenerating Latent Cache.{bcolors.ENDC}")
|
786 |
+
|
787 |
+
args.regenerate_latent_cache = True
|
788 |
+
#save the new batch_size and dataset_length to last_run.json
|
789 |
+
with open(logging_dir / "last_run.json", "w") as f:
|
790 |
+
json.dump({"batch_size": args.train_batch_size, "dataset_length": train_dataset_length}, f)
|
791 |
+
|
792 |
+
else:
|
793 |
+
#if it doesn't exist, create it
|
794 |
+
last_run = {"batch_size": args.train_batch_size, "dataset_length": train_dataset_length}
|
795 |
+
#create the file
|
796 |
+
with open(logging_dir / "last_run.json", "w") as f:
|
797 |
+
json.dump(last_run, f)
|
798 |
+
|
799 |
+
weight_dtype = torch.float32
|
800 |
+
if accelerator.mixed_precision == "fp16":
|
801 |
+
print("Using fp16")
|
802 |
+
weight_dtype = torch.float16
|
803 |
+
elif accelerator.mixed_precision == "bf16":
|
804 |
+
print("Using bf16")
|
805 |
+
weight_dtype = torch.bfloat16
|
806 |
+
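# tf32 is not handled by Accelerate's mixed_precision (it was mapped to 'no' above),
# so it is enabled here directly on the CUDA matmul backend while weights stay in fp32.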
elif args.mixed_precision == "tf32":
|
807 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
808 |
+
#torch.set_float32_matmul_precision("medium")
|
809 |
+
|
810 |
+
# Move text_encode and vae to gpu.
|
811 |
+
# For mixed precision training we cast the text_encoder and vae weights to half-precision
|
812 |
+
# as these models are only used for inference, keeping weights in full precision is not required.
|
813 |
+
vae.to(accelerator.device, dtype=weight_dtype)
|
814 |
+
if args.use_ema == True:
|
815 |
+
ema_unet.to(accelerator.device)
|
816 |
+
if not args.train_text_encoder:
|
817 |
+
text_encoder.to(accelerator.device, dtype=weight_dtype)
|
818 |
+
|
819 |
+
if args.use_bucketing:
|
820 |
+
wh = set([tuple(x.target_wh) for x in train_dataset.image_train_items])
|
821 |
+
else:
|
822 |
+
wh = set([tuple([args.resolution, args.resolution]) for x in train_dataset.image_paths])
|
823 |
+
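# For every resolution in use, precompute the scaled VAE latent of an all-zero image;
# this is reused later as the "fully unmasked" conditioning latent for inpainting steps.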
full_mask_by_aspect = {shape: vae.encode(torch.zeros(1, 3, shape[1], shape[0]).to(accelerator.device, dtype=weight_dtype)).latent_dist.mean * 0.18215 for shape in wh}
|
824 |
+
|
825 |
+
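# Latents (and, when the text encoder is frozen, text embeddings) are encoded once and
# cached to disk so the VAE and text encoder can be dropped from VRAM during training.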
cached_dataset = CachedLatentsDataset(batch_size=args.train_batch_size,
|
826 |
+
text_encoder=text_encoder,
|
827 |
+
tokenizer=tokenizer,
|
828 |
+
dtype=weight_dtype,
|
829 |
+
model_variant=args.model_variant,
|
830 |
+
shuffle_per_epoch="False",
|
831 |
+
args = args,)
|
832 |
+
|
833 |
+
gen_cache = False
|
834 |
+
data_len = len(train_dataloader)
|
835 |
+
latent_cache_dir = Path(args.output_dir, "logs", "latent_cache")
|
836 |
+
#check if latents_cache.pt exists in the output_dir
|
837 |
+
if not os.path.exists(latent_cache_dir):
|
838 |
+
os.makedirs(latent_cache_dir)
|
839 |
+
for i in range(0,data_len-1):
|
840 |
+
if not os.path.exists(os.path.join(latent_cache_dir, f"latents_cache_{i}.pt")):
|
841 |
+
gen_cache = True
|
842 |
+
break
|
843 |
+
if args.regenerate_latent_cache == True:
|
844 |
+
files = os.listdir(latent_cache_dir)
|
845 |
+
gen_cache = True
|
846 |
+
for file in files:
|
847 |
+
os.remove(os.path.join(latent_cache_dir,file))
|
848 |
+
if gen_cache == False :
|
849 |
+
print(f" {bcolors.OKGREEN}Loading Latent Cache from {latent_cache_dir}{bcolors.ENDC}")
|
850 |
+
del vae
|
851 |
+
if not args.train_text_encoder:
|
852 |
+
del text_encoder
|
853 |
+
if torch.cuda.is_available():
|
854 |
+
torch.cuda.empty_cache()
|
855 |
+
torch.cuda.ipc_collect()
|
856 |
+
#load all the cached latents into a single dataset
|
857 |
+
for i in range(0,data_len-1):
|
858 |
+
cached_dataset.add_pt_cache(os.path.join(latent_cache_dir,f"latents_cache_{i}.pt"))
|
859 |
+
if gen_cache == True:
|
860 |
+
#delete all the cached latents if they exist to avoid problems
|
861 |
+
print(f" {bcolors.WARNING}Generating latents cache...{bcolors.ENDC}")
|
862 |
+
train_dataset = LatentsDataset([], [], [], [], [], [])
|
863 |
+
counter = 0
|
864 |
+
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
865 |
+
with torch.no_grad():
|
866 |
+
for batch in tqdm(train_dataloader, desc="Caching latents", bar_format='%s{l_bar}%s%s{bar}%s%s{r_bar}%s'%(bcolors.OKBLUE,bcolors.ENDC, bcolors.OKBLUE, bcolors.ENDC,bcolors.OKBLUE,bcolors.ENDC,)):
|
867 |
+
cached_extra = None
|
868 |
+
cached_mask = None
|
869 |
+
batch["pixel_values"] = batch["pixel_values"].to(accelerator.device, non_blocking=True, dtype=weight_dtype)
|
870 |
+
batch["input_ids"] = batch["input_ids"].to(accelerator.device, non_blocking=True)
|
871 |
+
cached_latent = vae.encode(batch["pixel_values"]).latent_dist
|
872 |
+
if batch["mask_values"] is not None:
|
873 |
+
cached_mask = functional.resize(batch["mask_values"], size=cached_latent.mean.shape[2:])
|
874 |
+
if batch["mask_values"] is not None and args.model_variant == "inpainting":
|
875 |
+
batch["mask_values"] = batch["mask_values"].to(accelerator.device, non_blocking=True, dtype=weight_dtype)
|
876 |
+
cached_extra = vae.encode(batch["pixel_values"] * (1 - batch["mask_values"])).latent_dist
|
877 |
+
if args.model_variant == "depth2img":
|
878 |
+
batch["extra_values"] = batch["extra_values"].to(accelerator.device, non_blocking=True, dtype=weight_dtype)
|
879 |
+
cached_extra = functional.resize(batch["extra_values"], size=cached_latent.mean.shape[2:])
|
880 |
+
if args.train_text_encoder:
|
881 |
+
cached_text_enc = batch["input_ids"]
|
882 |
+
else:
|
883 |
+
cached_text_enc = text_encoder(batch["input_ids"])[0]
|
884 |
+
train_dataset.add_latent(cached_latent, cached_text_enc, cached_mask, cached_extra, batch["tokens"])
|
885 |
+
del batch
|
886 |
+
del cached_latent
|
887 |
+
del cached_text_enc
|
888 |
+
del cached_mask
|
889 |
+
del cached_extra
|
890 |
+
torch.save(train_dataset, os.path.join(latent_cache_dir,f"latents_cache_{counter}.pt"))
|
891 |
+
cached_dataset.add_pt_cache(os.path.join(latent_cache_dir,f"latents_cache_{counter}.pt"))
|
892 |
+
counter += 1
|
893 |
+
train_dataset = LatentsDataset([], [], [], [], [], [])
|
894 |
+
#if counter % 300 == 0:
|
895 |
+
#train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, collate_fn=lambda x: x, shuffle=False)
|
896 |
+
# gc.collect()
|
897 |
+
# torch.cuda.empty_cache()
|
898 |
+
# accelerator.free_memory()
|
899 |
+
|
900 |
+
#clear vram after caching latents
|
901 |
+
del vae
|
902 |
+
if not args.train_text_encoder:
|
903 |
+
del text_encoder
|
904 |
+
if torch.cuda.is_available():
|
905 |
+
torch.cuda.empty_cache()
|
906 |
+
torch.cuda.ipc_collect()
|
907 |
+
#load all the cached latents into a single dataset
|
908 |
+
train_dataloader = torch.utils.data.DataLoader(cached_dataset, batch_size=1, collate_fn=lambda x: x, shuffle=False)
|
909 |
+
print(f" {bcolors.OKGREEN}Latents are ready.{bcolors.ENDC}")
|
910 |
+
# Scheduler and math around the number of training steps.
|
911 |
+
overrode_max_train_steps = False
|
912 |
+
num_update_steps_per_epoch = len(train_dataloader)
|
913 |
+
if args.max_train_steps is None:
|
914 |
+
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
915 |
+
overrode_max_train_steps = True
|
916 |
+
|
917 |
+
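# A warmup value below 1 is interpreted as a fraction of the total optimization steps.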
if args.lr_warmup_steps < 1:
|
918 |
+
args.lr_warmup_steps = math.floor(args.lr_warmup_steps * args.max_train_steps / args.gradient_accumulation_steps)
|
919 |
+
|
920 |
+
lr_scheduler = get_scheduler(
|
921 |
+
args.lr_scheduler,
|
922 |
+
optimizer=optimizer,
|
923 |
+
num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
|
924 |
+
num_training_steps=args.max_train_steps,
|
925 |
+
)
|
926 |
+
|
927 |
+
if args.train_text_encoder and not args.use_ema:
|
928 |
+
unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
|
929 |
+
unet, text_encoder, optimizer, train_dataloader, lr_scheduler
|
930 |
+
)
|
931 |
+
elif args.train_text_encoder and args.use_ema:
|
932 |
+
unet, text_encoder, ema_unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
|
933 |
+
unet, text_encoder, ema_unet, optimizer, train_dataloader, lr_scheduler
|
934 |
+
)
|
935 |
+
elif not args.train_text_encoder and args.use_ema:
|
936 |
+
unet, ema_unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
|
937 |
+
unet, ema_unet, optimizer, train_dataloader, lr_scheduler
|
938 |
+
)
|
939 |
+
elif not args.train_text_encoder and not args.use_ema:
|
940 |
+
unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
|
941 |
+
unet, optimizer, train_dataloader, lr_scheduler
|
942 |
+
)
|
943 |
+
if args.with_gan:
|
944 |
+
lr_scheduler_discriminator = get_scheduler(
|
945 |
+
args.lr_scheduler,
|
946 |
+
optimizer=optimizer_discriminator,
|
947 |
+
num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
|
948 |
+
num_training_steps=args.max_train_steps,
|
949 |
+
)
|
950 |
+
discriminator, optimizer_discriminator, lr_scheduler_discriminator = accelerator.prepare(discriminator, optimizer_discriminator, lr_scheduler_discriminator)
|
951 |
+
|
952 |
+
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
953 |
+
num_update_steps_per_epoch = len(train_dataloader)
|
954 |
+
if overrode_max_train_steps:
|
955 |
+
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
956 |
+
#print(args.max_train_steps, num_update_steps_per_epoch)
|
957 |
+
# Afterwards we recalculate our number of training epochs
|
958 |
+
#print(args.max_train_steps, num_update_steps_per_epoch)
|
959 |
+
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
960 |
+
|
961 |
+
# We need to initialize the trackers we use, and also store our configuration.
|
962 |
+
# The trackers are initialized automatically on the main process.
|
963 |
+
if accelerator.is_main_process:
|
964 |
+
accelerator.init_trackers("dreambooth")
|
965 |
+
# Train!
|
966 |
+
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
967 |
+
|
968 |
+
logger.info("***** Running training *****")
|
969 |
+
logger.info(f" Num examples = {len(train_dataset)}")
|
970 |
+
logger.info(f" Num batches each epoch = {len(train_dataloader)}")
|
971 |
+
logger.info(f" Num Epochs = {args.num_train_epochs}")
|
972 |
+
logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
|
973 |
+
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
|
974 |
+
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
|
975 |
+
logger.info(f" Total optimization steps = {args.max_train_steps}")
|
976 |
+
def mid_train_playground(step):
|
977 |
+
|
978 |
+
tqdm.write(f"{bcolors.WARNING} Booting up GUI{bcolors.ENDC}")
|
979 |
+
epoch = step // num_update_steps_per_epoch
|
980 |
+
if args.train_text_encoder and args.stop_text_encoder_training == True:
|
981 |
+
text_enc_model = accelerator.unwrap_model(text_encoder,True)
|
982 |
+
elif args.train_text_encoder and args.stop_text_encoder_training > epoch:
|
983 |
+
text_enc_model = accelerator.unwrap_model(text_encoder,True)
|
984 |
+
elif args.train_text_encoder == False:
|
985 |
+
text_enc_model = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder" )
|
986 |
+
elif args.train_text_encoder and args.stop_text_encoder_training <= epoch:
|
987 |
+
if 'frozen_directory' in locals():
|
988 |
+
text_enc_model = CLIPTextModel.from_pretrained(frozen_directory, subfolder="text_encoder")
|
989 |
+
else:
|
990 |
+
text_enc_model = accelerator.unwrap_model(text_encoder,True)
|
991 |
+
scheduler = DPMSolverMultistepScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
992 |
+
unwrapped_unet = accelerator.unwrap_model(ema_unet if args.use_ema else unet,True)
|
993 |
+
|
994 |
+
pipeline = DiffusionPipeline.from_pretrained(
|
995 |
+
args.pretrained_model_name_or_path,
|
996 |
+
unet=unwrapped_unet,
|
997 |
+
text_encoder=text_enc_model,
|
998 |
+
vae=AutoencoderKL.from_pretrained(args.pretrained_vae_name_or_path or args.pretrained_model_name_or_path,subfolder=None if args.pretrained_vae_name_or_path else "vae", safe_serialization=True),
|
999 |
+
safety_checker=None,
|
1000 |
+
torch_dtype=weight_dtype,
|
1001 |
+
local_files_only=False,
|
1002 |
+
requires_safety_checker=False,
|
1003 |
+
)
|
1004 |
+
pipeline.scheduler = scheduler
|
1005 |
+
if is_xformers_available() and args.attention=='xformers':
|
1006 |
+
try:
|
1007 |
+
vae.enable_xformers_memory_efficient_attention()
|
1008 |
+
unet.enable_xformers_memory_efficient_attention()
|
1009 |
+
except Exception as e:
|
1010 |
+
logger.warning(
|
1011 |
+
"Could not enable memory efficient attention. Make sure xformers is installed"
|
1012 |
+
f" correctly and a GPU is available: {e}"
|
1013 |
+
)
|
1014 |
+
elif args.attention=='flash_attention':
|
1015 |
+
replace_unet_cross_attn_to_flash_attention()
|
1016 |
+
pipeline = pipeline.to(accelerator.device)
|
1017 |
+
def inference(prompt, negative_prompt, num_samples, height=512, width=512, num_inference_steps=50,seed=-1,guidance_scale=7.5):
|
1018 |
+
with torch.autocast("cuda"), torch.inference_mode():
|
1019 |
+
if seed != -1:
|
1020 |
+
if g_cuda is None:
|
1021 |
+
g_cuda = torch.Generator(device='cuda')
|
1022 |
+
else:
|
1023 |
+
g_cuda.manual_seed(int(seed))
|
1024 |
+
else:
|
1025 |
+
seed = random.randint(0, 100000)
|
1026 |
+
g_cuda = torch.Generator(device='cuda')
|
1027 |
+
g_cuda.manual_seed(seed)
|
1028 |
+
return pipeline(
|
1029 |
+
prompt, height=int(height), width=int(width),
|
1030 |
+
negative_prompt=negative_prompt,
|
1031 |
+
num_images_per_prompt=int(num_samples),
|
1032 |
+
num_inference_steps=int(num_inference_steps), guidance_scale=guidance_scale,
|
1033 |
+
generator=g_cuda).images, seed
|
1034 |
+
|
1035 |
+
with gr.Blocks() as demo:
|
1036 |
+
with gr.Row():
|
1037 |
+
with gr.Column():
|
1038 |
+
prompt = gr.Textbox(label="Prompt", value="photo of zwx dog in a bucket")
|
1039 |
+
negative_prompt = gr.Textbox(label="Negative Prompt", value="")
|
1040 |
+
run = gr.Button(value="Generate")
|
1041 |
+
with gr.Row():
|
1042 |
+
num_samples = gr.Number(label="Number of Samples", value=4)
|
1043 |
+
guidance_scale = gr.Number(label="Guidance Scale", value=7.5)
|
1044 |
+
with gr.Row():
|
1045 |
+
height = gr.Number(label="Height", value=512)
|
1046 |
+
width = gr.Number(label="Width", value=512)
|
1047 |
+
with gr.Row():
|
1048 |
+
num_inference_steps = gr.Slider(label="Steps", value=25)
|
1049 |
+
seed = gr.Number(label="Seed", value=-1)
|
1050 |
+
with gr.Column():
|
1051 |
+
gallery = gr.Gallery()
|
1052 |
+
seedDisplay = gr.Number(label="Used Seed:", value=0)
|
1053 |
+
|
1054 |
+
run.click(inference, inputs=[prompt, negative_prompt, num_samples, height, width, num_inference_steps,seed, guidance_scale], outputs=[gallery,seedDisplay])
|
1055 |
+
|
1056 |
+
demo.launch(share=True,prevent_thread_lock=True)
|
1057 |
+
tqdm.write(f"{bcolors.WARNING}Gradio Session is active, Press 'F12' to resume training{bcolors.ENDC}")
|
1058 |
+
keyboard.wait('f12')
|
1059 |
+
demo.close()
|
1060 |
+
del demo
|
1061 |
+
del text_enc_model
|
1062 |
+
del unwrapped_unet
|
1063 |
+
del pipeline
|
1064 |
+
return
|
1065 |
+
|
1066 |
+
def save_and_sample_weights(step,context='checkpoint',save_model=True):
|
1067 |
+
try:
|
1068 |
+
#check how many folders are in the output dir
|
1069 |
+
#if there are more than 5, delete the oldest one
|
1070 |
+
#save the model
|
1071 |
+
#save the optimizer
|
1072 |
+
#save the lr_scheduler
|
1073 |
+
#save the args
|
1074 |
+
height = args.sample_height
|
1075 |
+
width = args.sample_width
|
1076 |
+
batch_prompts = []
|
1077 |
+
if args.sample_from_batch > 0:
|
1078 |
+
num_samples = args.sample_from_batch if args.sample_from_batch < args.train_batch_size else args.train_batch_size
|
1079 |
+
batch_prompts = []
|
1080 |
+
tokens = args.batch_tokens
|
1081 |
+
if tokens != None:
|
1082 |
+
allPrompts = list(set([tokenizer.decode(p).replace('<|endoftext|>','').replace('<|startoftext|>', '') for p in tokens]))
|
1083 |
+
if len(allPrompts) < num_samples:
|
1084 |
+
num_samples = len(allPrompts)
|
1085 |
+
batch_prompts = random.sample(allPrompts, num_samples)
|
1086 |
+
|
1087 |
+
|
1088 |
+
if args.sample_aspect_ratios:
|
1089 |
+
#choose random aspect ratio from ASPECTS
|
1090 |
+
aspect_ratio = random.choice(ASPECTS)
|
1091 |
+
height = aspect_ratio[0]
|
1092 |
+
width = aspect_ratio[1]
|
1093 |
+
if os.path.exists(args.output_dir):
|
1094 |
+
if args.detect_full_drive==True:
|
1095 |
+
folders = os.listdir(args.output_dir)
|
1096 |
+
#check how much space is left on the drive
|
1097 |
+
total, used, free = shutil.disk_usage("/")
|
1098 |
+
if (free // (2**30)) < 4:
|
1099 |
+
#folders.remove("0")
|
1100 |
+
#get the folder with the lowest number
|
1101 |
+
#oldest_folder = min(folder for folder in folders if folder.isdigit())
|
1102 |
+
tqdm.write(f"{bcolors.FAIL}Drive is almost full, Please make some space to continue training.{bcolors.ENDC}")
|
1103 |
+
if args.send_telegram_updates:
|
1104 |
+
try:
|
1105 |
+
send_telegram_message(f"Drive is almost full, Please make some space to continue training.", args.telegram_chat_id, args.telegram_token)
|
1106 |
+
except:
|
1107 |
+
pass
|
1108 |
+
#count time
|
1109 |
+
import time
|
1110 |
+
start_time = time.time()
|
1111 |
+
import platform
|
1112 |
+
while input("Press Enter to continue... if you're on linux we'll wait 5 minutes for you to make space and continue"):
|
1113 |
+
#check if five minutes have passed
|
1114 |
+
#check if os is linux
|
1115 |
+
if 'Linux' in platform.platform():
|
1116 |
+
if time.time() - start_time > 300:
|
1117 |
+
break
|
1118 |
+
|
1119 |
+
|
1120 |
+
#oldest_folder_path = os.path.join(args.output_dir, oldest_folder)
|
1121 |
+
#shutil.rmtree(oldest_folder_path)
|
1122 |
+
# Create the pipeline using the trained modules and save it.
|
1123 |
+
if accelerator.is_main_process:
|
1124 |
+
if 'step' in context:
|
1125 |
+
#what is the current epoch
|
1126 |
+
epoch = step // num_update_steps_per_epoch
|
1127 |
+
else:
|
1128 |
+
epoch = step
|
1129 |
+
if args.train_text_encoder and args.stop_text_encoder_training == True:
|
1130 |
+
text_enc_model = accelerator.unwrap_model(text_encoder,True)
|
1131 |
+
elif args.train_text_encoder and args.stop_text_encoder_training > epoch:
|
1132 |
+
text_enc_model = accelerator.unwrap_model(text_encoder,True)
|
1133 |
+
elif args.train_text_encoder == False:
|
1134 |
+
text_enc_model = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder" )
|
1135 |
+
elif args.train_text_encoder and args.stop_text_encoder_training <= epoch:
|
1136 |
+
if 'frozen_directory' in locals():
|
1137 |
+
text_enc_model = CLIPTextModel.from_pretrained(frozen_directory, subfolder="text_encoder")
|
1138 |
+
else:
|
1139 |
+
text_enc_model = accelerator.unwrap_model(text_encoder,True)
|
1140 |
+
|
1141 |
+
#scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
|
1142 |
+
#scheduler = EulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler", prediction_type="v_prediction")
|
1143 |
+
scheduler = DPMSolverMultistepScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
1144 |
+
unwrapped_unet = accelerator.unwrap_model(unet,True)
|
1145 |
+
|
1146 |
+
pipeline = DiffusionPipeline.from_pretrained(
|
1147 |
+
args.pretrained_model_name_or_path,
|
1148 |
+
unet=unwrapped_unet,
|
1149 |
+
text_encoder=text_enc_model,
|
1150 |
+
vae=AutoencoderKL.from_pretrained(args.pretrained_vae_name_or_path or args.pretrained_model_name_or_path,subfolder=None if args.pretrained_vae_name_or_path else "vae",),
|
1151 |
+
safety_checker=None,
|
1152 |
+
torch_dtype=weight_dtype,
|
1153 |
+
local_files_only=False,
|
1154 |
+
requires_safety_checker=False,
|
1155 |
+
)
|
1156 |
+
pipeline.scheduler = scheduler
|
1157 |
+
if is_xformers_available() and args.attention=='xformers':
|
1158 |
+
try:
|
1159 |
+
unet.enable_xformers_memory_efficient_attention()
|
1160 |
+
except Exception as e:
|
1161 |
+
logger.warning(
|
1162 |
+
"Could not enable memory efficient attention. Make sure xformers is installed"
|
1163 |
+
f" correctly and a GPU is available: {e}"
|
1164 |
+
)
|
1165 |
+
elif args.attention=='flash_attention':
|
1166 |
+
replace_unet_cross_attn_to_flash_attention()
|
1167 |
+
if args.run_name:
|
1168 |
+
save_dir = os.path.join(args.output_dir, f"{context}_{step}_{args.run_name}")
|
1169 |
+
else:
|
1170 |
+
save_dir = os.path.join(args.output_dir, f"{context}_{step}")
|
1171 |
+
if args.flatten_sample_folder:
|
1172 |
+
sample_dir = main_sample_dir
|
1173 |
+
else:
|
1174 |
+
sample_dir = os.path.join(main_sample_dir, f"{context}_{step}")
|
1175 |
+
#if sample dir path does not exist, create it
|
1176 |
+
|
1177 |
+
if args.stop_text_encoder_training == True:
|
1178 |
+
save_dir = frozen_directory
|
1179 |
+
if save_model:
|
1180 |
+
pipeline.save_pretrained(save_dir,safe_serialization=True)
|
1181 |
+
if args.with_gan:
|
1182 |
+
discriminator.save_pretrained(os.path.join(save_dir, "discriminator"), safe_serialization=True)
|
1183 |
+
if args.use_ema:
|
1184 |
+
ema_unet.save_pretrained(os.path.join(save_dir, "unet_ema"), safe_serialization=True)
|
1185 |
+
with open(os.path.join(save_dir, "args.json"), "w") as f:
|
1186 |
+
json.dump(args.__dict__, f, indent=2)
|
1187 |
+
if args.stop_text_encoder_training == True:
|
1188 |
+
#delete every folder in frozen_directory but the text encoder
|
1189 |
+
for folder in os.listdir(save_dir):
|
1190 |
+
if folder != "text_encoder" and os.path.isdir(os.path.join(save_dir, folder)):
|
1191 |
+
shutil.rmtree(os.path.join(save_dir, folder))
|
1192 |
+
imgs = []
|
1193 |
+
if args.use_ema and args.sample_from_ema:
|
1194 |
+
pipeline.unet = ema_unet
|
1195 |
+
|
1196 |
+
for param in unet.parameters():
|
1197 |
+
param.requires_grad = False
|
1198 |
+
if torch.cuda.is_available():
|
1199 |
+
torch.cuda.empty_cache()
|
1200 |
+
torch.cuda.ipc_collect()
|
1201 |
+
|
1202 |
+
if args.add_sample_prompt is not None or batch_prompts != [] and args.stop_text_encoder_training != True:
|
1203 |
+
prompts = []
|
1204 |
+
if args.add_sample_prompt is not None:
|
1205 |
+
for prompt in args.add_sample_prompt:
|
1206 |
+
prompts.append(prompt)
|
1207 |
+
if batch_prompts != []:
|
1208 |
+
for prompt in batch_prompts:
|
1209 |
+
prompts.append(prompt)
|
1210 |
+
|
1211 |
+
pipeline = pipeline.to(accelerator.device)
|
1212 |
+
pipeline.set_progress_bar_config(disable=True)
|
1213 |
+
#sample_dir = os.path.join(save_dir, "samples")
|
1214 |
+
#if sample_dir exists, delete it
|
1215 |
+
if os.path.exists(sample_dir):
|
1216 |
+
if not args.flatten_sample_folder:
|
1217 |
+
shutil.rmtree(sample_dir)
|
1218 |
+
os.makedirs(sample_dir, exist_ok=True)
|
1219 |
+
with torch.autocast("cuda"), torch.inference_mode():
|
1220 |
+
if args.send_telegram_updates:
|
1221 |
+
try:
|
1222 |
+
send_telegram_message(f"Generating samples for <b>{step}</b> {context}", args.telegram_chat_id, args.telegram_token)
|
1223 |
+
except:
|
1224 |
+
pass
|
1225 |
+
n_sample = args.n_save_sample
|
1226 |
+
if args.save_sample_controlled_seed:
|
1227 |
+
n_sample += len(args.save_sample_controlled_seed)
|
1228 |
+
progress_bar_sample = tqdm(total=len(prompts)*n_sample,desc="Generating samples")
|
1229 |
+
for samplePrompt in prompts:
|
1230 |
+
sampleIndex = prompts.index(samplePrompt)
|
1231 |
+
#convert sampleIndex to number in words
|
1232 |
+
# Data to be written
|
1233 |
+
sampleProperties = {
|
1234 |
+
"samplePrompt" : samplePrompt
|
1235 |
+
}
|
1236 |
+
|
1237 |
+
# Serializing json
|
1238 |
+
json_object = json.dumps(sampleProperties, indent=4)
|
1239 |
+
|
1240 |
+
if args.flatten_sample_folder:
|
1241 |
+
sampleName = f"{context}_{step}_prompt_{sampleIndex+1}"
|
1242 |
+
else:
|
1243 |
+
sampleName = f"prompt_{sampleIndex+1}"
|
1244 |
+
|
1245 |
+
if not args.flatten_sample_folder:
|
1246 |
+
os.makedirs(os.path.join(sample_dir,sampleName), exist_ok=True)
|
1247 |
+
|
1248 |
+
if args.model_variant == 'inpainting':
|
1249 |
+
conditioning_image = torch.zeros(1, 3, height, width)
|
1250 |
+
mask = torch.ones(1, 1, height, width)
|
1251 |
+
if args.model_variant == 'depth2img':
|
1252 |
+
#pil new white image
|
1253 |
+
test_image = Image.new('RGB', (width, height), (255, 255, 255))
|
1254 |
+
depth_image = Image.new('RGB', (width, height), (255, 255, 255))
|
1255 |
+
depth = np.array(depth_image.convert("L"))
|
1256 |
+
depth = depth.astype(np.float32) / 255.0
|
1257 |
+
depth = depth[None, None]
|
1258 |
+
depth = torch.from_numpy(depth)
|
1259 |
+
for i in range(n_sample):
|
1260 |
+
#check if the sample is controlled by a seed
|
1261 |
+
if i < args.n_save_sample:
|
1262 |
+
if args.model_variant == 'inpainting':
|
1263 |
+
images = pipeline(samplePrompt, conditioning_image, mask, height=height,width=width, guidance_scale=args.save_guidance_scale, num_inference_steps=args.save_infer_steps).images
|
1264 |
+
if args.model_variant == 'depth2img':
|
1265 |
+
images = pipeline(samplePrompt,image=test_image, guidance_scale=args.save_guidance_scale, num_inference_steps=args.save_infer_steps,strength=1.0).images
|
1266 |
+
elif args.model_variant == 'base':
|
1267 |
+
images = pipeline(samplePrompt,height=height,width=width, guidance_scale=args.save_guidance_scale, num_inference_steps=args.save_infer_steps).images
|
1268 |
+
|
1269 |
+
if not args.flatten_sample_folder:
|
1270 |
+
images[0].save(os.path.join(sample_dir,sampleName, f"{sampleName}_{i}.png"))
|
1271 |
+
else:
|
1272 |
+
images[0].save(os.path.join(sample_dir, f"{sampleName}_{i}.png"))
|
1273 |
+
|
1274 |
+
else:
|
1275 |
+
seed = args.save_sample_controlled_seed[i - args.n_save_sample]
|
1276 |
+
generator = torch.Generator("cuda").manual_seed(seed)
|
1277 |
+
if args.model_variant == 'inpainting':
|
1278 |
+
images = pipeline(samplePrompt,conditioning_image, mask,height=height,width=width, guidance_scale=args.save_guidance_scale, num_inference_steps=args.save_infer_steps, generator=generator).images
|
1279 |
+
if args.model_variant == 'depth2img':
|
1280 |
+
images = pipeline(samplePrompt,image=test_image, guidance_scale=args.save_guidance_scale, num_inference_steps=args.save_infer_steps,generator=generator,strength=1.0).images
|
1281 |
+
elif args.model_variant == 'base':
|
1282 |
+
images = pipeline(samplePrompt,height=height,width=width, guidance_scale=args.save_guidance_scale, num_inference_steps=args.save_infer_steps, generator=generator).images
|
1283 |
+
|
1284 |
+
if not args.flatten_sample_folder:
|
1285 |
+
images[0].save(os.path.join(sample_dir,sampleName, f"{sampleName}_controlled_seed_{str(seed)}.png"))
|
1286 |
+
else:
|
1287 |
+
images[0].save(os.path.join(sample_dir, f"{sampleName}_controlled_seed_{str(seed)}.png"))
|
1288 |
+
progress_bar_sample.update(1)
|
1289 |
+
|
1290 |
+
if args.send_telegram_updates:
|
1291 |
+
imgs = []
|
1292 |
+
#get all the images from the sample folder
|
1293 |
+
if not args.flatten_sample_folder:
|
1294 |
+
dir = os.listdir(os.path.join(sample_dir,sampleName))
|
1295 |
+
else:
|
1296 |
+
dir = os.listdir(sample_dir)
for file in dir:
if file.endswith(".png"):
#open the image with pil
img = Image.open(os.path.join(sample_dir, file) if args.flatten_sample_folder else os.path.join(sample_dir, sampleName, file))
|
1302 |
+
imgs.append(img)
|
1303 |
+
try:
|
1304 |
+
send_media_group(args.telegram_chat_id,args.telegram_token,imgs, caption=f"Samples for the <b>{step}</b> {context} using the prompt:\n\n<b>{samplePrompt}</b>")
|
1305 |
+
except:
|
1306 |
+
pass
|
1307 |
+
del pipeline
|
1308 |
+
del unwrapped_unet
|
1309 |
+
for param in unet.parameters():
|
1310 |
+
param.requires_grad = True
|
1311 |
+
if torch.cuda.is_available():
|
1312 |
+
torch.cuda.empty_cache()
|
1313 |
+
torch.cuda.ipc_collect()
|
1314 |
+
if save_model == True:
|
1315 |
+
tqdm.write(f"{bcolors.OKGREEN}Weights saved to {save_dir}{bcolors.ENDC}")
|
1316 |
+
elif save_model == False and len(imgs) > 0:
|
1317 |
+
del imgs
|
1318 |
+
tqdm.write(f"{bcolors.OKGREEN}Samples saved to {sample_dir}{bcolors.ENDC}")
|
1319 |
+
|
1320 |
+
except Exception as e:
|
1321 |
+
tqdm.write(str(e))
tqdm.write(f"{bcolors.FAIL} Error occurred during sampling, skipping.{bcolors.ENDC}")
|
1323 |
+
pass
|
1324 |
+
|
1325 |
+
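# EMA update: decay = min((step + 1) / (step + 10), 0.9999); each EMA parameter then moves
# toward the live weights by a (1 - decay) fraction of their difference, and frozen
# parameters are copied through unchanged.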
@torch.no_grad()
|
1326 |
+
def update_ema(ema_model, model):
|
1327 |
+
ema_step = ema_model.config["step"]
|
1328 |
+
decay = min((ema_step + 1) / (ema_step + 10), 0.9999)
|
1329 |
+
ema_model.config["step"] += 1
|
1330 |
+
for (s_param, param) in zip(ema_model.parameters(), model.parameters()):
|
1331 |
+
if param.requires_grad:
|
1332 |
+
s_param.add_((1 - decay) * (param - s_param))
|
1333 |
+
else:
|
1334 |
+
s_param.copy_(param)
|
1335 |
+
|
1336 |
+
|
1337 |
+
# Only show the progress bar once on each machine.
|
1338 |
+
progress_bar = tqdm(range(args.max_train_steps),bar_format='%s{l_bar}%s%s{bar}%s%s{r_bar}%s'%(bcolors.OKBLUE,bcolors.ENDC, bcolors.OKBLUE, bcolors.ENDC,bcolors.OKBLUE,bcolors.ENDC,), disable=not accelerator.is_local_main_process)
|
1339 |
+
progress_bar_inter_epoch = tqdm(range(num_update_steps_per_epoch),bar_format='%s{l_bar}%s%s{bar}%s%s{r_bar}%s'%(bcolors.OKBLUE,bcolors.ENDC, bcolors.OKGREEN, bcolors.ENDC,bcolors.OKBLUE,bcolors.ENDC,), disable=not accelerator.is_local_main_process)
|
1340 |
+
progress_bar_e = tqdm(range(args.num_train_epochs),bar_format='%s{l_bar}%s%s{bar}%s%s{r_bar}%s'%(bcolors.OKBLUE,bcolors.ENDC, bcolors.OKGREEN, bcolors.ENDC,bcolors.OKBLUE,bcolors.ENDC,), disable=not accelerator.is_local_main_process)
|
1341 |
+
|
1342 |
+
progress_bar.set_description("Overall Steps")
|
1343 |
+
progress_bar_inter_epoch.set_description("Steps To Epoch")
|
1344 |
+
progress_bar_e.set_description("Overall Epochs")
|
1345 |
+
global_step = 0
|
1346 |
+
loss_avg = AverageMeter("loss_avg", max_eta=0.999)
|
1347 |
+
gan_loss_avg = AverageMeter("gan_loss_avg", max_eta=0.999)
|
1348 |
+
text_enc_context = nullcontext() if args.train_text_encoder else torch.no_grad()
|
1349 |
+
if args.send_telegram_updates:
|
1350 |
+
try:
|
1351 |
+
send_telegram_message(f"Starting training with the following settings:\n\n{format_dict(args.__dict__)}", args.telegram_chat_id, args.telegram_token)
|
1352 |
+
except:
|
1353 |
+
pass
|
1354 |
+
try:
|
1355 |
+
tqdm.write(f"{bcolors.OKBLUE}Starting Training!{bcolors.ENDC}")
|
1356 |
+
try:
|
1357 |
+
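# Hotkey handlers (Ctrl+Shift+<key>, with Alt for the per-step variants): each one only
# flips a flag; the training loop acts on the flags at the next step or epoch boundary.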
def toggle_gui(event=None):
|
1358 |
+
if keyboard.is_pressed("ctrl") and keyboard.is_pressed("shift") and keyboard.is_pressed("g"):
|
1359 |
+
tqdm.write(f"{bcolors.WARNING}GUI will boot as soon as the current step is done.{bcolors.ENDC}")
|
1360 |
+
nonlocal mid_generation
|
1361 |
+
if mid_generation == True:
|
1362 |
+
mid_generation = False
|
1363 |
+
tqdm.write(f"{bcolors.WARNING}Cancelled GUI.{bcolors.ENDC}")
|
1364 |
+
else:
|
1365 |
+
mid_generation = True
|
1366 |
+
|
1367 |
+
def toggle_checkpoint(event=None):
|
1368 |
+
if keyboard.is_pressed("ctrl") and keyboard.is_pressed("shift") and keyboard.is_pressed("s") and not keyboard.is_pressed("alt"):
|
1369 |
+
tqdm.write(f"{bcolors.WARNING}Saving the model as soon as this epoch is done.{bcolors.ENDC}")
|
1370 |
+
nonlocal mid_checkpoint
|
1371 |
+
if mid_checkpoint == True:
|
1372 |
+
mid_checkpoint = False
|
1373 |
+
tqdm.write(f"{bcolors.WARNING}Cancelled Checkpointing.{bcolors.ENDC}")
|
1374 |
+
else:
|
1375 |
+
mid_checkpoint = True
|
1376 |
+
|
1377 |
+
def toggle_sample(event=None):
|
1378 |
+
if keyboard.is_pressed("ctrl") and keyboard.is_pressed("shift") and keyboard.is_pressed("p") and not keyboard.is_pressed("alt"):
|
1379 |
+
tqdm.write(f"{bcolors.WARNING}Sampling will begin as soon as this epoch is done.{bcolors.ENDC}")
|
1380 |
+
nonlocal mid_sample
|
1381 |
+
if mid_sample == True:
|
1382 |
+
mid_sample = False
|
1383 |
+
tqdm.write(f"{bcolors.WARNING}Cancelled Sampling.{bcolors.ENDC}")
|
1384 |
+
else:
|
1385 |
+
mid_sample = True
|
1386 |
+
def toggle_checkpoint_step(event=None):
|
1387 |
+
if keyboard.is_pressed("ctrl") and keyboard.is_pressed("shift") and keyboard.is_pressed("alt") and keyboard.is_pressed("s"):
|
1388 |
+
tqdm.write(f"{bcolors.WARNING}Saving the model as soon as this step is done.{bcolors.ENDC}")
|
1389 |
+
nonlocal mid_checkpoint_step
|
1390 |
+
if mid_checkpoint_step == True:
|
1391 |
+
mid_checkpoint_step = False
|
1392 |
+
tqdm.write(f"{bcolors.WARNING}Cancelled Checkpointing.{bcolors.ENDC}")
|
1393 |
+
else:
|
1394 |
+
mid_checkpoint_step = True
|
1395 |
+
|
1396 |
+
def toggle_sample_step(event=None):
|
1397 |
+
if keyboard.is_pressed("ctrl") and keyboard.is_pressed("shift") and keyboard.is_pressed("alt") and keyboard.is_pressed("p"):
|
1398 |
+
tqdm.write(f"{bcolors.WARNING}Sampling will begin as soon as this step is done.{bcolors.ENDC}")
|
1399 |
+
nonlocal mid_sample_step
|
1400 |
+
if mid_sample_step == True:
|
1401 |
+
mid_sample_step = False
|
1402 |
+
tqdm.write(f"{bcolors.WARNING}Cancelled Sampling.{bcolors.ENDC}")
|
1403 |
+
else:
|
1404 |
+
mid_sample_step = True
|
1405 |
+
def toggle_quit_and_save_epoch(event=None):
|
1406 |
+
if keyboard.is_pressed("ctrl") and keyboard.is_pressed("shift") and keyboard.is_pressed("q") and not keyboard.is_pressed("alt"):
|
1407 |
+
tqdm.write(f"{bcolors.WARNING}Quitting and saving the model as soon as this epoch is done.{bcolors.ENDC}")
|
1408 |
+
nonlocal mid_quit
|
1409 |
+
if mid_quit == True:
|
1410 |
+
mid_quit = False
|
1411 |
+
tqdm.write(f"{bcolors.WARNING}Cancelled Quitting.{bcolors.ENDC}")
|
1412 |
+
else:
|
1413 |
+
mid_quit = True
|
1414 |
+
def toggle_quit_and_save_step(event=None):
|
1415 |
+
if keyboard.is_pressed("ctrl") and keyboard.is_pressed("shift") and keyboard.is_pressed("alt") and keyboard.is_pressed("q"):
|
1416 |
+
tqdm.write(f"{bcolors.WARNING}Quitting and saving the model as soon as this step is done.{bcolors.ENDC}")
|
1417 |
+
nonlocal mid_quit_step
|
1418 |
+
if mid_quit_step == True:
|
1419 |
+
mid_quit_step = False
|
1420 |
+
tqdm.write(f"{bcolors.WARNING}Cancelled Quitting.{bcolors.ENDC}")
|
1421 |
+
else:
|
1422 |
+
mid_quit_step = True
|
1423 |
+
def help(event=None):
|
1424 |
+
if keyboard.is_pressed("ctrl") and keyboard.is_pressed("h"):
|
1425 |
+
print_instructions()
|
1426 |
+
keyboard.on_press_key("g", toggle_gui)
|
1427 |
+
keyboard.on_press_key("s", toggle_checkpoint)
|
1428 |
+
keyboard.on_press_key("p", toggle_sample)
|
1429 |
+
keyboard.on_press_key("s", toggle_checkpoint_step)
|
1430 |
+
keyboard.on_press_key("p", toggle_sample_step)
|
1431 |
+
keyboard.on_press_key("q", toggle_quit_and_save_epoch)
|
1432 |
+
keyboard.on_press_key("q", toggle_quit_and_save_step)
|
1433 |
+
keyboard.on_press_key("h", help)
|
1434 |
+
print_instructions()
|
1435 |
+
except Exception as e:
|
1436 |
+
pass
|
1437 |
+
|
1438 |
+
mid_generation = False
|
1439 |
+
mid_checkpoint = False
|
1440 |
+
mid_sample = False
|
1441 |
+
mid_checkpoint_step = False
|
1442 |
+
mid_sample_step = False
|
1443 |
+
mid_quit = False
|
1444 |
+
mid_quit_step = False
|
1445 |
+
#lambda set mid_generation to true
|
1446 |
+
if args.run_name:
|
1447 |
+
frozen_directory = os.path.join(args.output_dir, f"frozen_text_encoder_{args.run_name}")
|
1448 |
+
else:
|
1449 |
+
frozen_directory = os.path.join(args.output_dir, "frozen_text_encoder")
|
1450 |
+
|
1451 |
+
unet_stats = {}
|
1452 |
+
discriminator_stats = {}
|
1453 |
+
|
1454 |
+
os.makedirs(main_sample_dir, exist_ok=True)
|
1455 |
+
with open(os.path.join(main_sample_dir, "args.json"), "w") as f:
|
1456 |
+
json.dump(args.__dict__, f, indent=2)
|
1457 |
+
if args.with_gan:
|
1458 |
+
with open(os.path.join(main_sample_dir, "discriminator_config.json"), "w") as f:
|
1459 |
+
json.dump(discriminator.config, f, indent=2)
|
1460 |
+
|
1461 |
+
for epoch in range(args.num_train_epochs):
|
1462 |
+
#every 10 epochs print instructions
|
1463 |
+
unet.train()
|
1464 |
+
if args.train_text_encoder:
|
1465 |
+
text_encoder.train()
|
1466 |
+
|
1467 |
+
#generate samples from the initial weights before any training steps
|
1468 |
+
if args.sample_on_training_start==True and epoch==0:
|
1469 |
+
save_and_sample_weights(epoch,'start',save_model=False)
|
1470 |
+
|
1471 |
+
if args.train_text_encoder and args.stop_text_encoder_training == epoch:
|
1472 |
+
args.stop_text_encoder_training = True
|
1473 |
+
if accelerator.is_main_process:
|
1474 |
+
tqdm.write(f"{bcolors.WARNING} Stopping text encoder training{bcolors.ENDC}")
|
1475 |
+
current_percentage = (epoch/args.num_train_epochs)*100
|
1476 |
+
#round to the nearest whole number
|
1477 |
+
current_percentage = round(current_percentage,0)
|
1478 |
+
try:
|
1479 |
+
send_telegram_message(f"Text encoder training stopped at epoch {epoch} which is {current_percentage}% of training. Freezing weights and saving.", args.telegram_chat_id, args.telegram_token)
|
1480 |
+
except:
|
1481 |
+
pass
|
1482 |
+
if os.path.exists(frozen_directory):
|
1483 |
+
#delete the folder if it already exists
|
1484 |
+
shutil.rmtree(frozen_directory)
|
1485 |
+
os.mkdir(frozen_directory)
|
1486 |
+
save_and_sample_weights(epoch,'epoch')
|
1487 |
+
args.stop_text_encoder_training = epoch
|
1488 |
+
progress_bar_inter_epoch.reset(total=num_update_steps_per_epoch)
|
1489 |
+
for step, batch in enumerate(train_dataloader):
|
1490 |
+
with accelerator.accumulate(unet):
|
1491 |
+
# Convert images to latent space
|
1492 |
+
with torch.no_grad():
|
1493 |
+
|
1494 |
+
latent_dist = batch[0][0]
|
1495 |
+
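# 0.18215 is the Stable Diffusion VAE latent scaling factor.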
latents = latent_dist.sample() * 0.18215
|
1496 |
+
|
1497 |
+
if args.model_variant == 'inpainting':
|
1498 |
+
mask = batch[0][2]
|
1499 |
+
mask_mean = batch[0][3]
|
1500 |
+
conditioning_latent_dist = batch[0][4]
|
1501 |
+
conditioning_latents = conditioning_latent_dist.sample() * 0.18215
|
1502 |
+
if args.model_variant == 'depth2img':
|
1503 |
+
depth = batch[0][4]
|
1504 |
+
if args.sample_from_batch > 0:
|
1505 |
+
args.batch_tokens = batch[0][5]
|
1506 |
+
# Sample noise that we'll add to the latents
|
1507 |
+
# plus optional offset noise so the model learns to shift the zero-frequency (mean) component of the image freely
|
1508 |
+
# https://www.crosslabs.org/blog/diffusion-with-offset-noise
|
1509 |
+
if (args.with_offset_noise == True):
|
1510 |
+
noise = torch.randn_like(latents) + (args.offset_noise_weight * torch.randn(latents.shape[0], latents.shape[1], 1, 1).to(accelerator.device))
|
1511 |
+
else:
|
1512 |
+
noise = torch.randn_like(latents)
|
1513 |
+
|
1514 |
+
bsz = latents.shape[0]
|
1515 |
+
# Sample a random timestep for each image
|
1516 |
+
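# max_denoising_strength < 1.0 limits sampling to the lower (less noisy) part of the schedule.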
timesteps = torch.randint(0, int(noise_scheduler.config.num_train_timesteps * args.max_denoising_strength), (bsz,), device=latents.device)
|
1517 |
+
timesteps = timesteps.long()
|
1518 |
+
|
1519 |
+
# Add noise to the latents according to the noise magnitude at each timestep
|
1520 |
+
# (this is the forward diffusion process)
|
1521 |
+
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
1522 |
+
|
1523 |
+
# Get the text embedding for conditioning
|
1524 |
+
with text_enc_context:
|
1525 |
+
if args.train_text_encoder:
|
1526 |
+
if args.clip_penultimate == True:
|
1527 |
+
encoder_hidden_states = text_encoder(batch[0][1],output_hidden_states=True)
|
1528 |
+
encoder_hidden_states = text_encoder.text_model.final_layer_norm(encoder_hidden_states['hidden_states'][-2])
|
1529 |
+
else:
|
1530 |
+
encoder_hidden_states = text_encoder(batch[0][1])[0]
|
1531 |
+
else:
|
1532 |
+
encoder_hidden_states = batch[0][1]
|
1533 |
+
|
1534 |
+
|
1535 |
+
# Predict the noise residual
|
1536 |
+
mask=None
|
1537 |
+
if args.model_variant == 'inpainting':
|
1538 |
+
if mask is not None and random.uniform(0, 1) < args.unmasked_probability:
|
1539 |
+
# for some steps, predict the unmasked image
|
1540 |
+
conditioning_latents = torch.stack([full_mask_by_aspect[tuple([latents.shape[3]*8, latents.shape[2]*8])].squeeze()] * bsz)
|
1541 |
+
mask = torch.ones(bsz, 1, latents.shape[2], latents.shape[3]).to(accelerator.device, dtype=weight_dtype)
|
1542 |
+
noisy_inpaint_latents = torch.concat([noisy_latents, mask, conditioning_latents], 1)
|
1543 |
+
model_pred = unet(noisy_inpaint_latents, timesteps, encoder_hidden_states).sample
|
1544 |
+
elif args.model_variant == 'depth2img':
|
1545 |
+
noisy_depth_latents = torch.cat([noisy_latents, depth], dim=1)
|
1546 |
+
model_pred = unet(noisy_depth_latents, timesteps, encoder_hidden_states, depth).sample
|
1547 |
+
elif args.model_variant == "base":
|
1548 |
+
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
|
1549 |
+
|
1550 |
+
|
1551 |
+
# Get the target for loss depending on the prediction type
|
1552 |
+
if noise_scheduler.config.prediction_type == "epsilon":
|
1553 |
+
target = noise
|
1554 |
+
elif noise_scheduler.config.prediction_type == "v_prediction":
|
1555 |
+
target = noise_scheduler.get_velocity(latents, noise, timesteps)
|
1556 |
+
else:
|
1557 |
+
raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
|
1558 |
+
|
1559 |
+
# GAN stuff
|
1560 |
+
# Input: noisy_latents
|
1561 |
+
# True output: target
|
1562 |
+
# Fake output: model_pred
|
1563 |
+
|
1564 |
+
if args.with_gan:
|
1565 |
+
# Turn on learning for the discriminator, and do an optimization step
|
1566 |
+
for param in discriminator.parameters():
|
1567 |
+
param.requires_grad = True
|
1568 |
+
|
1569 |
+
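# Least-squares GAN objective: the discriminator, conditioned on the noisy latents and text
# embedding, is trained toward 0 on the UNet's (detached) prediction and toward 1 on the true target.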
pred_fake = discriminator(torch.cat((noisy_latents, model_pred), 1).detach(), encoder_hidden_states)
|
1570 |
+
pred_real = discriminator(torch.cat((noisy_latents, target), 1), encoder_hidden_states)
|
1571 |
+
discriminator_loss = F.mse_loss(pred_fake, torch.zeros_like(pred_fake), reduction="mean") + F.mse_loss(pred_real, torch.ones_like(pred_real), reduction="mean")
|
1572 |
+
if discriminator_loss.isnan():
|
1573 |
+
tqdm.write(f"{bcolors.WARNING}Discriminator loss is NAN, skipping GAN update.{bcolors.ENDC}")
|
1574 |
+
else:
|
1575 |
+
accelerator.backward(discriminator_loss)
|
1576 |
+
if accelerator.sync_gradients:
|
1577 |
+
accelerator.clip_grad_norm_(discriminator.parameters(), args.max_grad_norm)
|
1578 |
+
optimizer_discriminator.step()
|
1579 |
+
lr_scheduler_discriminator.step()
|
1580 |
+
# Hack to fix NaNs caused by GAN training
|
1581 |
+
for name, p in discriminator.named_parameters():
|
1582 |
+
if p.isnan().any():
|
1583 |
+
fix_nans_(p, name, discriminator_stats[name])
|
1584 |
+
else:
|
1585 |
+
(std, mean) = torch.std_mean(p)
|
1586 |
+
discriminator_stats[name] = (std.item(), mean.item())
|
1587 |
+
del std, mean
|
1588 |
+
optimizer_discriminator.zero_grad()
|
1589 |
+
del pred_real, pred_fake, discriminator_loss
|
1590 |
+
|
1591 |
+
# Turn off learning for the discriminator for the generator optimization step
|
1592 |
+
for param in discriminator.parameters():
|
1593 |
+
param.requires_grad = False
|
1594 |
+
|
1595 |
+
if args.with_prior_preservation:
|
1596 |
+
# Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
|
1597 |
+
"""
|
1598 |
+
noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
|
1599 |
+
noise, noise_prior = torch.chunk(noise, 2, dim=0)
|
1600 |
+
|
1601 |
+
# Compute instance loss
|
1602 |
+
loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="none").mean([1, 2, 3]).mean()
|
1603 |
+
|
1604 |
+
# Compute prior loss
|
1605 |
+
prior_loss = F.mse_loss(noise_pred_prior.float(), noise_prior.float(), reduction="mean")
|
1606 |
+
|
1607 |
+
# Add the prior loss to the instance loss.
|
1608 |
+
loss = loss + args.prior_loss_weight * prior_loss
|
1609 |
+
"""
|
1610 |
+
# Chunk the noise and model_pred into two parts and compute the loss on each part separately.
|
1611 |
+
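# The first half of the batch is instance data and the second half class data (see collate_fn).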
model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
|
1612 |
+
target, target_prior = torch.chunk(target, 2, dim=0)
|
1613 |
+
if mask is not None and args.model_variant != "inpainting":
|
1614 |
+
loss = masked_mse_loss(model_pred.float(), target.float(), mask, reduction="none").mean([1, 2, 3]).mean()
|
1615 |
+
prior_loss = masked_mse_loss(model_pred_prior.float(), target_prior.float(), mask, reduction="mean")
|
1616 |
+
else:
|
1617 |
+
loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
|
1618 |
+
prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
|
1619 |
+
|
1620 |
+
# Add the prior loss to the instance loss.
|
1621 |
+
loss = loss + args.prior_loss_weight * prior_loss
|
1622 |
+
|
1623 |
+
if mask is not None and args.normalize_masked_area_loss:
|
1624 |
+
loss = loss / mask_mean
|
1625 |
+
|
1626 |
+
else:
|
1627 |
+
if mask is not None and args.model_variant != "inpainting":
|
1628 |
+
loss = masked_mse_loss(model_pred.float(), target.float(), mask, reduction="none").mean([1, 2, 3])
|
1629 |
+
loss = loss.mean()
|
1630 |
+
else:
|
1631 |
+
loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
|
1632 |
+
|
1633 |
+
if mask is not None and args.normalize_masked_area_loss:
|
1634 |
+
loss = loss / mask_mean
|
1635 |
+
|
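                    # The torch.chunk(..., 2, dim=0) calls above assume each batch is laid out as
                    # [instance samples | class/prior samples] along the batch dimension, so for a
                    # batch of 2N latents the first N drive the instance loss and the last N drive
                    # prior_loss, which is then scaled by args.prior_loss_weight.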
                    base_loss = loss

                    if args.with_gan:
                        # Add loss from the GAN
                        pred_fake = discriminator(torch.cat((noisy_latents, model_pred), 1), encoder_hidden_states)
                        gan_loss = F.mse_loss(pred_fake, torch.ones_like(pred_fake), reduction="mean")
                        if gan_loss.isnan():
                            tqdm.write(f"{bcolors.WARNING}GAN loss is NAN, skipping GAN loss.{bcolors.ENDC}")
                        else:
                            gan_weight = args.gan_weight
                            if args.gan_warmup and global_step < args.gan_warmup:
                                gan_weight *= global_step / args.gan_warmup
                            loss += gan_weight * gan_loss
                        del pred_fake

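                    # args.gan_warmup linearly ramps the adversarial term in: with e.g. args.gan_warmup = 1000,
                    # the effective weight grows from 0 up to args.gan_weight over the first 1000 global steps,
                    # presumably so the still-untrained discriminator cannot destabilize the UNet early on.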
                    accelerator.backward(loss)
                    if accelerator.sync_gradients:
                        params_to_clip = (
                            itertools.chain(unet.parameters(), text_encoder.parameters())
                            if args.train_text_encoder
                            else unet.parameters()
                        )
                        accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                    optimizer.step()
                    lr_scheduler.step()
                    # Hack to fix NaNs caused by GAN training
                    for name, p in unet.named_parameters():
                        if p.isnan().any():
                            fix_nans_(p, name, unet_stats[name])
                        else:
                            (std, mean) = torch.std_mean(p)
                            unet_stats[name] = (std.item(), mean.item())
                            del std, mean
                    optimizer.zero_grad()
                    loss_avg.update(base_loss.detach_())
                    if args.with_gan and not gan_loss.isnan():
                        gan_loss_avg.update(gan_loss.detach_())
                    if args.use_ema == True:
                        update_ema(ema_unet, unet)

                    del loss, model_pred
                    if args.with_prior_preservation:
                        del model_pred_prior

                logs = {"loss": loss_avg.avg.item(), "lr": lr_scheduler.get_last_lr()[0]}
                if args.with_gan:
                    logs["gan_loss"] = gan_loss_avg.avg.item()
                progress_bar.set_postfix(**logs)
                if not global_step % args.log_interval:
                    accelerator.log(logs, step=global_step)



                if global_step > 0 and not global_step % args.sample_step_interval:
                    save_and_sample_weights(global_step,'step',save_model=False)

                progress_bar.update(1)
                progress_bar_inter_epoch.update(1)
                progress_bar_e.refresh()
                global_step += 1

                if mid_quit_step==True:
                    accelerator.wait_for_everyone()
                    save_and_sample_weights(global_step,'quit_step')
                    quit()
                if mid_generation==True:
                    mid_train_playground(global_step)
                    mid_generation=False
                if mid_checkpoint_step == True:
                    save_and_sample_weights(global_step,'step',save_model=True)
                    mid_checkpoint_step=False
                    mid_sample_step=False
                elif mid_sample_step == True:
                    save_and_sample_weights(global_step,'step',save_model=False)
                    mid_sample_step=False
                if global_step >= args.max_train_steps:
                    break
            progress_bar_e.update(1)
            if mid_quit==True:
                accelerator.wait_for_everyone()
                save_and_sample_weights(epoch,'quit_epoch')
                quit()
            if epoch == args.num_train_epochs - 1:
                save_and_sample_weights(epoch,'epoch',True)
            elif args.save_every_n_epoch and (epoch + 1) % args.save_every_n_epoch == 0:
                save_and_sample_weights(epoch,'epoch',True)
            elif mid_checkpoint==True:
                save_and_sample_weights(epoch,'epoch',True)
                mid_checkpoint=False
                mid_sample=False
            elif mid_sample==True:
                save_and_sample_weights(epoch,'epoch',False)
                mid_sample=False
            accelerator.wait_for_everyone()
    except Exception:
        try:
            send_telegram_message("Something went wrong while training! :(", args.telegram_chat_id, args.telegram_token)
            #save_and_sample_weights(global_step,'checkpoint')
            send_telegram_message(f"Saved checkpoint {global_step} on exit", args.telegram_chat_id, args.telegram_token)
        except Exception:
            pass
        raise
    except KeyboardInterrupt:
        send_telegram_message("Training stopped", args.telegram_chat_id, args.telegram_token)
        try:
            send_telegram_message("Training finished!", args.telegram_chat_id, args.telegram_token)
        except:
            pass

    accelerator.end_training()



if __name__ == "__main__":
    main()
StableTuner_RunPod_Fix/trainer_util.py
ADDED
@@ -0,0 +1,435 @@
import gradio as gr
import json
import math
from pathlib import Path
from typing import Optional
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel, DPMSolverMultistepScheduler, EulerDiscreteScheduler
from diffusers.optimization import get_scheduler
from huggingface_hub import HfFolder, Repository, whoami
from torchvision import transforms
from tqdm.auto import tqdm
from typing import Dict, List, Generator, Tuple
from PIL import Image, ImageFile, ImageFilter
from collections.abc import Iterable
from dataloaders_util import *
# used further down in this module (FlashAttention helpers, model class lookup, depth maps)
import diffusers
from einops import rearrange
from torch import einsum
from transformers import PretrainedConfig

# FlashAttention based on https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/memory_efficient_attention_pytorch/flash_attention.py
# LICENSE MIT https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/LICENSE

# constants
EPSILON = 1e-6

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

# helper functions
def print_instructions():
    tqdm.write(f"{bcolors.WARNING}Use 'CTRL+SHIFT+G' to open up a GUI to play around with the model (will pause training){bcolors.ENDC}")
    tqdm.write(f"{bcolors.WARNING}Use 'CTRL+SHIFT+S' to save a checkpoint of the current epoch{bcolors.ENDC}")
    tqdm.write(f"{bcolors.WARNING}Use 'CTRL+SHIFT+P' to generate samples for current epoch{bcolors.ENDC}")
    tqdm.write(f"{bcolors.WARNING}Use 'CTRL+SHIFT+Q' to save and quit after the current epoch{bcolors.ENDC}")
    tqdm.write(f"{bcolors.WARNING}Use 'CTRL+SHIFT+ALT+S' to save a checkpoint of the current step{bcolors.ENDC}")
    tqdm.write(f"{bcolors.WARNING}Use 'CTRL+SHIFT+ALT+P' to generate samples for current step{bcolors.ENDC}")
    tqdm.write(f"{bcolors.WARNING}Use 'CTRL+SHIFT+ALT+Q' to save and quit after the current step{bcolors.ENDC}")
    tqdm.write('')
    tqdm.write(f"{bcolors.WARNING}Use 'CTRL+H' to print this message again.{bcolors.ENDC}")

def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
    if token is None:
        token = HfFolder.get_token()
    if organization is None:
        username = whoami(token)["name"]
        return f"{username}/{model_id}"
    else:
        return f"{organization}/{model_id}"

# function to format a dictionary into a telegram message
def format_dict(d):
    message = ""
    for key, value in d.items():
        # filter keys that have the word "token" in them
        if "token" in key and "tokenizer" not in key:
            value = "TOKEN"
        if 'id' in key:
            value = "ID"
        # if value is a dictionary, format it recursively
        if isinstance(value, dict):
            for k, v in value.items():
                message += f"\n- {k}: <b>{v}</b> \n"
        elif isinstance(value, list):
            # each value is a new line in the message
            message += f"- {key}:\n\n"
            for v in value:
                message += f" <b>{v}</b>\n\n"
        # if value is a list, format it as a list
        else:
            message += f"- {key}: <b>{value}</b>\n"
    return message

def send_telegram_message(message, chat_id, token):
    url = f"https://api.telegram.org/bot{token}/sendMessage?chat_id={chat_id}&text={message}&parse_mode=html&disable_notification=True"
    import requests
    req = requests.get(url)
    if req.status_code != 200:
        raise ValueError(f"Telegram request failed with status code {req.status_code}")

def send_media_group(chat_id, telegram_token, images, caption=None, reply_to_message_id=None):
    """
    Use this method to send an album of photos. On success, an array of Messages that were sent is returned.
    :param chat_id: chat id
    :param images: list of PIL images to send
    :param caption: caption of image
    :param reply_to_message_id: If the message is a reply, ID of the original message
    :return: response with the sent message
    """
    SEND_MEDIA_GROUP = f'https://api.telegram.org/bot{telegram_token}/sendMediaGroup'
    from io import BytesIO
    import requests
    files = {}
    media = []
    for i, img in enumerate(images):
        with BytesIO() as output:
            img.save(output, format='PNG')
            output.seek(0)
            name = f'photo{i}'
            files[name] = output.read()
        # a list of InputMediaPhoto. attach refers to the name of the file in the files dict
        media.append(dict(type='photo', media=f'attach://{name}'))
    media[0]['caption'] = caption
    media[0]['parse_mode'] = 'HTML'
    return requests.post(SEND_MEDIA_GROUP, data={'chat_id': chat_id, 'media': json.dumps(media), 'disable_notification': True, 'reply_to_message_id': reply_to_message_id}, files=files)

class AverageMeter:
    def __init__(self, name=None, max_eta=None):
        self.name = name
        self.max_eta = max_eta
        self.reset()

    def reset(self):
        self.count = self.avg = 0

    @torch.no_grad()
    def update(self, val, n=1):
        eta = self.count / (self.count + n)
        if self.max_eta:
            eta = min(eta, self.max_eta ** n)
        self.avg += (1 - eta) * (val - self.avg)
        self.count += n

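# Quick illustration of AverageMeter (values here are just for the example):
#
#   meter = AverageMeter(name="loss")
#   for v in (1.0, 2.0, 3.0):
#       meter.update(torch.tensor(v))
#   # meter.avg is now the running mean, tensor(2.); with max_eta set (e.g. 0.999) the
#   # update rule caps eta and the meter behaves like an exponential moving average.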
def exists(val):
    return val is not None


def default(val, d):
    return val if exists(val) else d


def masked_mse_loss(predicted, target, mask, reduction="none"):
    masked_predicted = predicted * mask
    masked_target = target * mask
    return F.mse_loss(masked_predicted, masked_target, reduction=reduction)

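# Example of how masked_mse_loss is typically called from the trainer (shapes are illustrative):
#
#   pred = torch.randn(2, 4, 64, 64)    # model prediction in latent space
#   target = torch.randn(2, 4, 64, 64)  # epsilon or v target
#   mask = torch.ones(2, 1, 64, 64)     # 1 = keep, 0 = ignore; broadcast over channels
#   loss = masked_mse_loss(pred, target, mask, reduction="none").mean([1, 2, 3]).mean()
#
# Zeroing both inputs outside the mask means masked-out regions contribute nothing to the
# squared error, which is why the trainer optionally renormalizes by mask_mean afterwards.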
# flash attention forwards and backwards
# https://arxiv.org/abs/2205.14135


class FlashAttentionFunction(torch.autograd.function.Function):
    @staticmethod
    @torch.no_grad()
    def forward(ctx, q, k, v, mask, causal, q_bucket_size, k_bucket_size):
        """ Algorithm 2 in the paper """

        device = q.device
        dtype = q.dtype
        max_neg_value = -torch.finfo(q.dtype).max
        qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)

        o = torch.zeros_like(q)
        all_row_sums = torch.zeros(
            (*q.shape[:-1], 1), dtype=dtype, device=device)
        all_row_maxes = torch.full(
            (*q.shape[:-1], 1), max_neg_value, dtype=dtype, device=device)

        scale = (q.shape[-1] ** -0.5)

        if not exists(mask):
            mask = (None,) * math.ceil(q.shape[-2] / q_bucket_size)
        else:
            mask = rearrange(mask, 'b n -> b 1 1 n')
            mask = mask.split(q_bucket_size, dim=-1)

        row_splits = zip(
            q.split(q_bucket_size, dim=-2),
            o.split(q_bucket_size, dim=-2),
            mask,
            all_row_sums.split(q_bucket_size, dim=-2),
            all_row_maxes.split(q_bucket_size, dim=-2),
        )

        for ind, (qc, oc, row_mask, row_sums, row_maxes) in enumerate(row_splits):
            q_start_index = ind * q_bucket_size - qk_len_diff

            col_splits = zip(
                k.split(k_bucket_size, dim=-2),
                v.split(k_bucket_size, dim=-2),
            )

            for k_ind, (kc, vc) in enumerate(col_splits):
                k_start_index = k_ind * k_bucket_size

                attn_weights = einsum(
                    '... i d, ... j d -> ... i j', qc, kc) * scale

                if exists(row_mask):
                    attn_weights.masked_fill_(~row_mask, max_neg_value)

                if causal and q_start_index < (k_start_index + k_bucket_size - 1):
                    causal_mask = torch.ones((qc.shape[-2], kc.shape[-2]), dtype=torch.bool,
                                             device=device).triu(q_start_index - k_start_index + 1)
                    attn_weights.masked_fill_(causal_mask, max_neg_value)

                block_row_maxes = attn_weights.amax(dim=-1, keepdims=True)
                attn_weights -= block_row_maxes
                exp_weights = torch.exp(attn_weights)

                if exists(row_mask):
                    exp_weights.masked_fill_(~row_mask, 0.)

                block_row_sums = exp_weights.sum(
                    dim=-1, keepdims=True).clamp(min=EPSILON)

                new_row_maxes = torch.maximum(block_row_maxes, row_maxes)

                exp_values = einsum(
                    '... i j, ... j d -> ... i d', exp_weights, vc)

                exp_row_max_diff = torch.exp(row_maxes - new_row_maxes)
                exp_block_row_max_diff = torch.exp(
                    block_row_maxes - new_row_maxes)

                new_row_sums = exp_row_max_diff * row_sums + \
                    exp_block_row_max_diff * block_row_sums

                oc.mul_((row_sums / new_row_sums) * exp_row_max_diff).add_(
                    (exp_block_row_max_diff / new_row_sums) * exp_values)

                row_maxes.copy_(new_row_maxes)
                row_sums.copy_(new_row_sums)

        ctx.args = (causal, scale, mask, q_bucket_size, k_bucket_size)
        ctx.save_for_backward(q, k, v, o, all_row_sums, all_row_maxes)

        return o

    @staticmethod
    @torch.no_grad()
    def backward(ctx, do):
        """ Algorithm 4 in the paper """

        causal, scale, mask, q_bucket_size, k_bucket_size = ctx.args
        q, k, v, o, l, m = ctx.saved_tensors

        device = q.device

        max_neg_value = -torch.finfo(q.dtype).max
        qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)

        dq = torch.zeros_like(q)
        dk = torch.zeros_like(k)
        dv = torch.zeros_like(v)

        row_splits = zip(
            q.split(q_bucket_size, dim=-2),
            o.split(q_bucket_size, dim=-2),
            do.split(q_bucket_size, dim=-2),
            mask,
            l.split(q_bucket_size, dim=-2),
            m.split(q_bucket_size, dim=-2),
            dq.split(q_bucket_size, dim=-2)
        )

        for ind, (qc, oc, doc, row_mask, lc, mc, dqc) in enumerate(row_splits):
            q_start_index = ind * q_bucket_size - qk_len_diff

            col_splits = zip(
                k.split(k_bucket_size, dim=-2),
                v.split(k_bucket_size, dim=-2),
                dk.split(k_bucket_size, dim=-2),
                dv.split(k_bucket_size, dim=-2),
            )

            for k_ind, (kc, vc, dkc, dvc) in enumerate(col_splits):
                k_start_index = k_ind * k_bucket_size

                attn_weights = einsum(
                    '... i d, ... j d -> ... i j', qc, kc) * scale

                if causal and q_start_index < (k_start_index + k_bucket_size - 1):
                    causal_mask = torch.ones((qc.shape[-2], kc.shape[-2]), dtype=torch.bool,
                                             device=device).triu(q_start_index - k_start_index + 1)
                    attn_weights.masked_fill_(causal_mask, max_neg_value)

                exp_attn_weights = torch.exp(attn_weights - mc)

                if exists(row_mask):
                    exp_attn_weights.masked_fill_(~row_mask, 0.)

                p = exp_attn_weights / lc

                dv_chunk = einsum('... i j, ... i d -> ... j d', p, doc)
                dp = einsum('... i d, ... j d -> ... i j', doc, vc)

                D = (doc * oc).sum(dim=-1, keepdims=True)
                ds = p * scale * (dp - D)

                dq_chunk = einsum('... i j, ... j d -> ... i d', ds, kc)
                dk_chunk = einsum('... i j, ... i d -> ... j d', ds, qc)

                dqc.add_(dq_chunk)
                dkc.add_(dk_chunk)
                dvc.add_(dv_chunk)

        return dq, dk, dv, None, None, None, None

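# Minimal sketch of calling the autograd function directly (bucket sizes and shapes are example
# values; in this file it is wired up through forward_flash_attn below):
#
#   q = torch.randn(1, 8, 4096, 40)  # (batch, heads, tokens, head_dim)
#   k = torch.randn(1, 8, 77, 40)
#   v = torch.randn(1, 8, 77, 40)
#   out = FlashAttentionFunction.apply(q, k, v, None, False, 512, 1024)
#
# out has the same shape as q; attention is computed tile by tile, so the full (4096 x 77)
# attention matrix is never materialized at once.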
def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
    text_encoder_config = PretrainedConfig.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=revision,
    )
    model_class = text_encoder_config.architectures[0]

    if model_class == "CLIPTextModel":
        from transformers import CLIPTextModel

        return CLIPTextModel
    elif model_class == "RobertaSeriesModelWithTransformation":
        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation

        return RobertaSeriesModelWithTransformation
    else:
        raise ValueError(f"{model_class} is not supported.")

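# Example usage (the model id is a placeholder):
#
#   text_encoder_cls = import_model_class_from_model_name_or_path("runwayml/stable-diffusion-v1-5", None)
#   text_encoder = text_encoder_cls.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="text_encoder")
#
# The function only reads the text_encoder config and returns the matching transformers class;
# instantiating the weights is left to the caller.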
def replace_unet_cross_attn_to_flash_attention():
    print("Using FlashAttention")

    def forward_flash_attn(self, x, context=None, mask=None):
        q_bucket_size = 512
        k_bucket_size = 1024

        h = self.heads
        q = self.to_q(x)

        context = context if context is not None else x
        context = context.to(x.dtype)

        if hasattr(self, 'hypernetwork') and self.hypernetwork is not None:
            context_k, context_v = self.hypernetwork.forward(x, context)
            context_k = context_k.to(x.dtype)
            context_v = context_v.to(x.dtype)
        else:
            context_k = context
            context_v = context

        k = self.to_k(context_k)
        v = self.to_v(context_v)
        del context, x

        q, k, v = map(lambda t: rearrange(
            t, 'b n (h d) -> b h n d', h=h), (q, k, v))

        out = FlashAttentionFunction.apply(q, k, v, mask, False,
                                           q_bucket_size, k_bucket_size)

        out = rearrange(out, 'b h n d -> b n (h d)')

        # diffusers 0.6.0
        if type(self.to_out) is torch.nn.Sequential:
            return self.to_out(out)

        # diffusers 0.7.0
        out = self.to_out[0](out)
        out = self.to_out[1](out)
        return out

    diffusers.models.attention.CrossAttention.forward = forward_flash_attn

class Depth2Img:
    def __init__(self, unet, text_encoder, revision, pretrained_model_name_or_path, accelerator):
        self.unet = unet
        self.text_encoder = text_encoder
        self.revision = revision if revision != 'no' else 'fp32'
        self.pretrained_model_name_or_path = pretrained_model_name_or_path
        self.accelerator = accelerator
        self.pipeline = None

    def depth_images(self, paths):
        if self.pipeline is None:
            self.pipeline = DiffusionPipeline.from_pretrained(
                self.pretrained_model_name_or_path,
                unet=self.accelerator.unwrap_model(self.unet),
                text_encoder=self.accelerator.unwrap_model(self.text_encoder),
                revision=self.revision,
                local_files_only=True,)
            self.pipeline.to(self.accelerator.device)
        self.vae_scale_factor = 2 ** (len(self.pipeline.vae.config.block_out_channels) - 1)
        non_depth_image_files = []
        image_paths_by_path = {}

        for path in paths:
            # if path is a list
            if isinstance(path, list):
                img = Path(path[0])
            else:
                img = Path(path)
            if self.get_depth_image_path(img).exists():
                continue
            else:
                non_depth_image_files.append(img)
        image_objects = []
        for image_path in non_depth_image_files:
            image_instance = Image.open(image_path)
            if not image_instance.mode == "RGB":
                image_instance = image_instance.convert("RGB")
            image_instance = self.pipeline.feature_extractor(
                image_instance, return_tensors="pt"
            ).pixel_values

            image_instance = image_instance.to(self.accelerator.device)
            image_objects.append((image_path, image_instance))

        for image_path, image_instance in image_objects:
            path = image_path.parent
            ogImg = Image.open(image_path)
            ogImg_x = ogImg.size[0]
            ogImg_y = ogImg.size[1]
            depth_map = self.pipeline.depth_estimator(image_instance).predicted_depth
            depth_min = torch.amin(depth_map, dim=[0, 1, 2], keepdim=True)
            depth_max = torch.amax(depth_map, dim=[0, 1, 2], keepdim=True)
            depth_map = torch.nn.functional.interpolate(depth_map.unsqueeze(1), size=(ogImg_y, ogImg_x), mode="bicubic", align_corners=False,)

            depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0
            depth_map = depth_map[0, :, :]
            depth_map_image = transforms.ToPILImage()(depth_map)
            depth_map_image = depth_map_image.filter(ImageFilter.GaussianBlur(radius=1))
            depth_map_image.save(self.get_depth_image_path(image_path))
            #quit()
        return 2 ** (len(self.pipeline.vae.config.block_out_channels) - 1)

    def get_depth_image_path(self, image_path):
        # if image_path is a string, convert it to a Path object
        if isinstance(image_path, str):
            image_path = Path(image_path)
        return image_path.parent / f"{image_path.stem}-depth.png"

def fix_nans_(param, name=None, stats=None):
    (std, mean) = stats or (1, 0)
    tqdm.write(f"Fixing NaNs in {name}: shape {tuple(param.shape)}, dtype {param.dtype}, mean {mean}, std {std}")
    param.data = torch.where(param.data.isnan(), torch.randn_like(param.data) * std + mean, param.data).detach()
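# How the trainer uses fix_nans_: while parameters are finite it records (std, mean) per tensor
# (unet_stats[name] / discriminator_stats[name]); if a later step produces NaNs, the NaN entries
# are re-drawn from a normal distribution with those recorded statistics instead of aborting the run.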