diffusers-pipeline (#2)
- Use Diffusers pipeline. (2b69f2a727338ff151654eecfbe4fbf7d7541c2c)
- Remove constraint on transformers. (2e3e5e89399c1d39c86dab315217faebf79c3ffe)
Co-authored-by: Pedro Cuenca <[email protected]>
- edit_app.py +11 -89
- requirements.txt +3 -2
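
The change replaces the hand-rolled k-diffusion sampling loop with StableDiffusionInstructPix2PixPipeline from Diffusers. As a minimal standalone sketch of the call path the updated app uses (the model id, dtype, guidance defaults, seed 1371, and example assets appear in the diff below; the 20-step count, output filename, and variable names are illustrative only):

    import torch
    from PIL import Image
    from diffusers import StableDiffusionInstructPix2PixPipeline

    # Load the InstructPix2Pix checkpoint through the Diffusers pipeline, as main() now does.
    pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
        "timbrooks/instruct-pix2pix", torch_dtype=torch.float16, safety_checker=None
    ).to("cuda")

    image = Image.open("imgs/example.jpg").convert("RGB")
    generator = torch.manual_seed(1371)  # default seed used by the app's reset()

    # guidance_scale is the Text CFG weight, image_guidance_scale the Image CFG weight.
    edited = pipe(
        "make him wear a beanie",   # one of the app's example instructions
        image=image,
        guidance_scale=7.5,
        image_guidance_scale=1.5,
        num_inference_steps=20,     # illustrative; the app passes its own steps value
        generator=generator,
    ).images[0]
    edited.save("edited.jpg")       # illustrative output path
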
edit_app.py
CHANGED
@@ -2,30 +2,18 @@ from __future__ import annotations
 
 import math
 import random
-import sys
 
-import einops
 import gradio as gr
-import k_diffusion as K
-import numpy as np
 import torch
-import torch.nn as nn
-from einops import rearrange
-from omegaconf import OmegaConf
 from PIL import Image, ImageOps
-from
-from huggingface_hub import hf_hub_download
-
-sys.path.append("./stable_diffusion")
-
-from stable_diffusion.ldm.util import instantiate_from_config
+from diffusers import StableDiffusionInstructPix2PixPipeline
 
 
 help_text = """
 If you're not getting what you want, there may be a few reasons:
 1. Is the image not changing enough? Your Image CFG weight may be too high. This value dictates how similar the output should be to the input. It's possible your edit requires larger changes from the original image, and your Image CFG weight isn't allowing that. Alternatively, your Text CFG weight may be too low. This value dictates how much to listen to the text instruction. The default Image CFG of 1.5 and Text CFG of 7.5 are a good starting point, but aren't necessarily optimal for each edit. Try:
     * Decreasing the Image CFG weight, or
-    *
+    * Increasing the Text CFG weight, or
 2. Conversely, is the image changing too much, such that the details in the original image aren't preserved? Try:
     * Increasing the Image CFG weight, or
     * Decreasing the Text CFG weight
@@ -56,55 +44,10 @@ example_instructions = [
     "make him wear a beanie",
 ]
 
-
-class CFGDenoiser(nn.Module):
-    def __init__(self, model):
-        super().__init__()
-        self.inner_model = model
-
-    def forward(self, z, sigma, cond, uncond, text_cfg_scale, image_cfg_scale):
-        cfg_z = einops.repeat(z, "1 ... -> n ...", n=3)
-        cfg_sigma = einops.repeat(sigma, "1 ... -> n ...", n=3)
-        cfg_cond = {
-            "c_crossattn": [torch.cat([cond["c_crossattn"][0], uncond["c_crossattn"][0], uncond["c_crossattn"][0]])],
-            "c_concat": [torch.cat([cond["c_concat"][0], cond["c_concat"][0], uncond["c_concat"][0]])],
-        }
-        out_cond, out_img_cond, out_uncond = self.inner_model(cfg_z, cfg_sigma, cond=cfg_cond).chunk(3)
-        return out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
-
-
-def load_model_from_config(config, ckpt, vae_ckpt=None, verbose=False):
-    print(f"Loading model from {ckpt}")
-    pl_sd = torch.load(ckpt, map_location="cpu")
-    if "global_step" in pl_sd:
-        print(f"Global Step: {pl_sd['global_step']}")
-    sd = pl_sd["state_dict"]
-    if vae_ckpt is not None:
-        print(f"Loading VAE from {vae_ckpt}")
-        vae_sd = torch.load(vae_ckpt, map_location="cpu")["state_dict"]
-        sd = {
-            k: vae_sd[k[len("first_stage_model.") :]] if k.startswith("first_stage_model.") else v
-            for k, v in sd.items()
-        }
-    model = instantiate_from_config(config.model)
-    m, u = model.load_state_dict(sd, strict=False)
-    if len(m) > 0 and verbose:
-        print("missing keys:")
-        print(m)
-    if len(u) > 0 and verbose:
-        print("unexpected keys:")
-        print(u)
-    return model
-
+model_id = "timbrooks/instruct-pix2pix"
 
 def main():
-
-    config = OmegaConf.load("configs/generate.yaml")
-    model = load_model_from_config(config, ckpt)
-    model.eval().cuda()
-    model_wrap = K.external.CompVisDenoiser(model)
-    model_wrap_cfg = CFGDenoiser(model_wrap)
-    null_token = model.get_learned_conditioning([""])
+    pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None).to("cuda")
     example_image = Image.open("imgs/example.jpg").convert("RGB")
 
     def load_example(
@@ -151,34 +94,13 @@ def main():
         if instruction == "":
            return [input_image, seed]
 
-
-
-
-
-
-
-
-        uncond = {}
-        uncond["c_crossattn"] = [null_token]
-        uncond["c_concat"] = [torch.zeros_like(cond["c_concat"][0])]
-
-        sigmas = model_wrap.get_sigmas(steps)
-
-        extra_args = {
-            "cond": cond,
-            "uncond": uncond,
-            "text_cfg_scale": text_cfg_scale,
-            "image_cfg_scale": image_cfg_scale,
-        }
-        torch.manual_seed(seed)
-        z = torch.randn_like(cond["c_concat"][0]) * sigmas[0]
-        z = K.sampling.sample_euler_ancestral(model_wrap_cfg, z, sigmas, extra_args=extra_args)
-        x = model.decode_first_stage(z)
-        x = torch.clamp((x + 1.0) / 2.0, min=0.0, max=1.0)
-        x = 255.0 * rearrange(x, "1 c h w -> h w c")
-        edited_image = Image.fromarray(x.type(torch.uint8).cpu().numpy())
-
-        return [seed, text_cfg_scale, image_cfg_scale, edited_image]
+        generator = torch.manual_seed(seed)
+        edited_image = pipe(
+            instruction, image=input_image,
+            guidance_scale=text_cfg_scale, image_guidance_scale=image_cfg_scale,
+            num_inference_steps=steps, generator=generator,
+        ).images[0]
+        return [seed, text_cfg_scale, image_cfg_scale, edited_image]
 
     def reset():
         return [0, "Randomize Seed", 1371, "Fix CFG", 7.5, 1.5, None]
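
The deleted CFGDenoiser made the dual guidance explicit: three noise predictions (instruction-conditioned, image-conditioned, unconditioned) are combined using the text and image CFG weights. The Diffusers pipeline applies the same combination internally through guidance_scale and image_guidance_scale. A small sketch of that formula, with random tensors standing in for the three predictions (the function name and shapes here are illustrative):

    import torch

    def combine_instruct_pix2pix_cfg(out_cond, out_img_cond, out_uncond, text_cfg_scale, image_cfg_scale):
        # Same three-way classifier-free guidance combination the removed CFGDenoiser.forward returned;
        # guidance_scale / image_guidance_scale in the pipeline play the roles of text_cfg_scale / image_cfg_scale.
        return (
            out_uncond
            + text_cfg_scale * (out_cond - out_img_cond)
            + image_cfg_scale * (out_img_cond - out_uncond)
        )

    # Toy latent-shaped tensors standing in for the three noise predictions.
    preds = [torch.randn(1, 4, 64, 64) for _ in range(3)]
    guided = combine_instruct_pix2pix_cfg(*preds, text_cfg_scale=7.5, image_cfg_scale=1.5)
    print(guided.shape)  # torch.Size([1, 4, 64, 64])
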
requirements.txt
CHANGED
@@ -15,7 +15,7 @@ test-tube>=0.7.5
 streamlit>=0.73.1
 einops==0.3.0
 torch-fidelity==0.3.0
-transformers
+transformers
 torchmetrics==0.6.0
 kornia==0.6
 -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
@@ -23,4 +23,5 @@ kornia==0.6
 huggingface-hub
 openai
 seaborn
-git+https://github.com/crowsonkb/k-diffusion.git
+git+https://github.com/crowsonkb/k-diffusion.git
+git+https://github.com/huggingface/diffusers
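
diffusers is installed straight from the GitHub repository, presumably because StableDiffusionInstructPix2PixPipeline was not yet available in a tagged release when this commit was made. A quick check that the installed build exposes the class edit_app.py imports:

    import diffusers
    # Raises ImportError on diffusers builds that predate the InstructPix2Pix pipeline.
    from diffusers import StableDiffusionInstructPix2PixPipeline

    print(diffusers.__version__)
    print(StableDiffusionInstructPix2PixPipeline.__name__)
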