Committed by timbrooks and pcuenq (HF staff)
Commit: bd9110a
Parent: f0d1e32

diffusers-pipeline (#2)


- Use Diffusers pipeline. (2b69f2a727338ff151654eecfbe4fbf7d7541c2c)
- Remove constraint on transformers. (2e3e5e89399c1d39c86dab315217faebf79c3ffe)


Co-authored-by: Pedro Cuenca <[email protected]>

Files changed (2):
  1. edit_app.py +11 -89
  2. requirements.txt +3 -2
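
For orientation before the per-file diffs: the app now delegates model loading and sampling to the StableDiffusionInstructPix2PixPipeline from diffusers. Below is a minimal standalone sketch of that flow; the instruction, seed, and step count are illustrative rather than values fixed by this commit, and a CUDA GPU is assumed.

import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline

# Load the InstructPix2Pix weights through the Diffusers pipeline (fp16, no safety checker),
# mirroring what edit_app.py now does in main().
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16, safety_checker=None
).to("cuda")

image = Image.open("imgs/example.jpg").convert("RGB")  # example image shipped with the Space
generator = torch.manual_seed(1371)                     # the app's default seed
edited = pipe(
    "make him wear a beanie",        # illustrative instruction from the app's examples
    image=image,
    guidance_scale=7.5,              # Text CFG weight
    image_guidance_scale=1.5,        # Image CFG weight
    num_inference_steps=50,          # illustrative step count
    generator=generator,
).images[0]
edited.save("edited.jpg")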
edit_app.py CHANGED
@@ -2,30 +2,18 @@ from __future__ import annotations
 
 import math
 import random
-import sys
 
-import einops
 import gradio as gr
-import k_diffusion as K
-import numpy as np
 import torch
-import torch.nn as nn
-from einops import rearrange
-from omegaconf import OmegaConf
 from PIL import Image, ImageOps
-from torch import autocast
-from huggingface_hub import hf_hub_download
-
-sys.path.append("./stable_diffusion")
-
-from stable_diffusion.ldm.util import instantiate_from_config
+from diffusers import StableDiffusionInstructPix2PixPipeline
 
 
 help_text = """
 If you're not getting what you want, there may be a few reasons:
 1. Is the image not changing enough? Your Image CFG weight may be too high. This value dictates how similar the output should be to the input. It's possible your edit requires larger changes from the original image, and your Image CFG weight isn't allowing that. Alternatively, your Text CFG weight may be too low. This value dictates how much to listen to the text instruction. The default Image CFG of 1.5 and Text CFG of 7.5 are a good starting point, but aren't necessarily optimal for each edit. Try:
 * Decreasing the Image CFG weight, or
-* Incerasing the Text CFG weight, or
+* Increasing the Text CFG weight, or
 2. Conversely, is the image changing too much, such that the details in the original image aren't preserved? Try:
 * Increasing the Image CFG weight, or
 * Decreasing the Text CFG weight
@@ -56,55 +44,10 @@ example_instructions = [
     "make him wear a beanie",
 ]
 
-
-class CFGDenoiser(nn.Module):
-    def __init__(self, model):
-        super().__init__()
-        self.inner_model = model
-
-    def forward(self, z, sigma, cond, uncond, text_cfg_scale, image_cfg_scale):
-        cfg_z = einops.repeat(z, "1 ... -> n ...", n=3)
-        cfg_sigma = einops.repeat(sigma, "1 ... -> n ...", n=3)
-        cfg_cond = {
-            "c_crossattn": [torch.cat([cond["c_crossattn"][0], uncond["c_crossattn"][0], uncond["c_crossattn"][0]])],
-            "c_concat": [torch.cat([cond["c_concat"][0], cond["c_concat"][0], uncond["c_concat"][0]])],
-        }
-        out_cond, out_img_cond, out_uncond = self.inner_model(cfg_z, cfg_sigma, cond=cfg_cond).chunk(3)
-        return out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
-
-
-def load_model_from_config(config, ckpt, vae_ckpt=None, verbose=False):
-    print(f"Loading model from {ckpt}")
-    pl_sd = torch.load(ckpt, map_location="cpu")
-    if "global_step" in pl_sd:
-        print(f"Global Step: {pl_sd['global_step']}")
-    sd = pl_sd["state_dict"]
-    if vae_ckpt is not None:
-        print(f"Loading VAE from {vae_ckpt}")
-        vae_sd = torch.load(vae_ckpt, map_location="cpu")["state_dict"]
-        sd = {
-            k: vae_sd[k[len("first_stage_model.") :]] if k.startswith("first_stage_model.") else v
-            for k, v in sd.items()
-        }
-    model = instantiate_from_config(config.model)
-    m, u = model.load_state_dict(sd, strict=False)
-    if len(m) > 0 and verbose:
-        print("missing keys:")
-        print(m)
-    if len(u) > 0 and verbose:
-        print("unexpected keys:")
-        print(u)
-    return model
-
+model_id = "timbrooks/instruct-pix2pix"
 
 def main():
-    ckpt = hf_hub_download(repo_id="timbrooks/instruct-pix2pix", filename="instruct-pix2pix-00-22000.ckpt")
-    config = OmegaConf.load("configs/generate.yaml")
-    model = load_model_from_config(config, ckpt)
-    model.eval().cuda()
-    model_wrap = K.external.CompVisDenoiser(model)
-    model_wrap_cfg = CFGDenoiser(model_wrap)
-    null_token = model.get_learned_conditioning([""])
+    pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None).to("cuda")
     example_image = Image.open("imgs/example.jpg").convert("RGB")
 
     def load_example(
@@ -151,34 +94,13 @@ def main():
         if instruction == "":
            return [input_image, seed]
 
-        with torch.no_grad(), autocast("cuda"), model.ema_scope():
-            cond = {}
-            cond["c_crossattn"] = [model.get_learned_conditioning([instruction])]
-            input_image = 2 * torch.tensor(np.array(input_image)).float() / 255 - 1
-            input_image = rearrange(input_image, "h w c -> 1 c h w").to(model.device)
-            cond["c_concat"] = [model.encode_first_stage(input_image).mode()]
-
-            uncond = {}
-            uncond["c_crossattn"] = [null_token]
-            uncond["c_concat"] = [torch.zeros_like(cond["c_concat"][0])]
-
-            sigmas = model_wrap.get_sigmas(steps)
-
-            extra_args = {
-                "cond": cond,
-                "uncond": uncond,
-                "text_cfg_scale": text_cfg_scale,
-                "image_cfg_scale": image_cfg_scale,
-            }
-            torch.manual_seed(seed)
-            z = torch.randn_like(cond["c_concat"][0]) * sigmas[0]
-            z = K.sampling.sample_euler_ancestral(model_wrap_cfg, z, sigmas, extra_args=extra_args)
-            x = model.decode_first_stage(z)
-            x = torch.clamp((x + 1.0) / 2.0, min=0.0, max=1.0)
-            x = 255.0 * rearrange(x, "1 c h w -> h w c")
-            edited_image = Image.fromarray(x.type(torch.uint8).cpu().numpy())
-
-            return [seed, text_cfg_scale, image_cfg_scale, edited_image]
+        generator = torch.manual_seed(seed)
+        edited_image = pipe(
+            instruction, image=input_image,
+            guidance_scale=text_cfg_scale, image_guidance_scale=image_cfg_scale,
+            num_inference_steps=steps, generator=generator,
+        ).images[0]
+        return [seed, text_cfg_scale, image_cfg_scale, edited_image]
 
     def reset():
         return [0, "Randomize Seed", 1371, "Fix CFG", 7.5, 1.5, None]
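
One note on the removed code: CFGDenoiser implemented dual classifier-free guidance over the text and image conditioning, and the Diffusers pipeline is expected to apply the equivalent combination internally via guidance_scale and image_guidance_scale. Below is a self-contained restatement of that rule, with random tensors standing in for the three denoiser outputs.

import torch

def combine_cfg(out_cond, out_img_cond, out_uncond, text_cfg_scale, image_cfg_scale):
    # The guidance rule from the deleted CFGDenoiser.forward: push away from the
    # unconditional output along both the text and image conditioning directions.
    return (
        out_uncond
        + text_cfg_scale * (out_cond - out_img_cond)
        + image_cfg_scale * (out_img_cond - out_uncond)
    )

# Toy shapes standing in for latent-space denoiser outputs.
out_cond, out_img_cond, out_uncond = (torch.randn(1, 4, 64, 64) for _ in range(3))
guided = combine_cfg(out_cond, out_img_cond, out_uncond, text_cfg_scale=7.5, image_cfg_scale=1.5)
print(guided.shape)  # torch.Size([1, 4, 64, 64])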
requirements.txt CHANGED
@@ -15,7 +15,7 @@ test-tube>=0.7.5
 streamlit>=0.73.1
 einops==0.3.0
 torch-fidelity==0.3.0
-transformers==4.19.2
+transformers
 torchmetrics==0.6.0
 kornia==0.6
 -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
@@ -23,4 +23,5 @@ kornia==0.6
 huggingface-hub
 openai
 seaborn
-git+https://github.com/crowsonkb/k-diffusion.git
+git+https://github.com/crowsonkb/k-diffusion.git
+git+https://github.com/huggingface/diffusers
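
After installing the updated requirements, a quick sanity check (a suggestion, not part of the commit) that the unpinned transformers and the git install of diffusers resolve to versions exposing the pipeline class:

import diffusers
import transformers
from diffusers import StableDiffusionInstructPix2PixPipeline

# Print the resolved versions; the pipeline class only exists in recent diffusers,
# which is presumably why requirements.txt installs it from the git repository.
print("diffusers:", diffusers.__version__)
print("transformers:", transformers.__version__)
print("pipeline class available:", StableDiffusionInstructPix2PixPipeline.__name__)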