import torch
import torch.nn as nn
import numpy as np
from sampler import DDPMSampler
from tqdm import tqdm
from PIL import Image

WIDTH = 512
HEIGHT = 512
LATENTS_WIDTH = WIDTH // 8
LATENTS_HEIGHT = HEIGHT // 8
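# The VAE compresses each spatial dimension by a factor of 8, so a 512x512
# RGB image corresponds to a (4, 64, 64) latent; that is why the latent
# dimensions above are WIDTH // 8 and HEIGHT // 8.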


def generate(
    prompt,
    uncond_prompt=None,
    input_image=None,
    strength=0.8,
    do_cfg=True,
    cfg_scale=7.5,
    sampler_name="ddpm",
    n_inference_steps=50,
    models={},
    seed=None,
    device=None,
    idle_device=None,
    tokenizer=None,
):
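    """Run the diffusion pipeline: text-to-image, or image-to-image when
    `input_image` is given.

    `uncond_prompt` is the negative prompt used for classifier-free guidance
    when `do_cfg` is True, and `cfg_scale` controls how strongly the prompt
    steers the output. `strength` applies only to image-to-image: higher
    values add more noise to the encoded input, so the result deviates more
    from it. `models` maps "clip", "encoder", "diffusion" and "decoder" to
    their loaded modules; `idle_device` is where each module is parked after
    use. Returns a (Height, Width, Channel) uint8 numpy array.
    """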
    with torch.no_grad():
        if not 0 < strength <= 1:
            raise ValueError("strength must be between 0 and 1")

        if idle_device:
            to_idle = lambda x: x.to(idle_device)
        else:
            to_idle = lambda x: x

        # Initialize the random number generator with the specified seed
        generator = torch.Generator(device=device)
        if seed is None:
            generator.seed()
        else:
            generator.manual_seed(seed)
        clip = models["clip"]
        clip.to(device)

        if do_cfg:
            # Tokenize the prompt, padded to the CLIP sequence length of 77
            cond_tokens = tokenizer.batch_encode_plus(
                [prompt], padding="max_length", max_length=77
            ).input_ids
            # (Batch_Size, Seq_Len)
            cond_tokens = torch.tensor(cond_tokens, dtype=torch.long, device=device)
            # (Batch_Size, Seq_Len) -> (Batch_Size, Seq_Len, Dim)
            cond_context = clip(cond_tokens)
            # Tokenize the negative prompt the same way
            uncond_tokens = tokenizer.batch_encode_plus(
                [uncond_prompt], padding="max_length", max_length=77
            ).input_ids
            # (Batch_Size, Seq_Len)
            uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=device)
            # (Batch_Size, Seq_Len) -> (Batch_Size, Seq_Len, Dim)
            uncond_context = clip(uncond_tokens)
            # (Batch_Size, Seq_Len, Dim) + (Batch_Size, Seq_Len, Dim) -> (2 * Batch_Size, Seq_Len, Dim)
            context = torch.cat([cond_context, uncond_context])
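            # Batching the conditional and unconditional contexts lets the
            # diffusion model handle both in one forward pass: the latents are
            # duplicated with repeat(2, 1, 1, 1) below and the two halves of
            # the output are separated again with chunk(2).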
        else:
            # Tokenize the prompt, padded to the CLIP sequence length of 77
            tokens = tokenizer.batch_encode_plus(
                [prompt], padding="max_length", max_length=77
            ).input_ids
            # (Batch_Size, Seq_Len)
            tokens = torch.tensor(tokens, dtype=torch.long, device=device)
            # (Batch_Size, Seq_Len) -> (Batch_Size, Seq_Len, Dim)
            context = clip(tokens)
        to_idle(clip)

        if sampler_name == "ddpm":
            sampler = DDPMSampler(generator)
            sampler.set_inference_timesteps(n_inference_steps)
        else:
            raise ValueError(f"Unknown sampler value {sampler_name}.")
        latents_shape = (1, 4, LATENTS_HEIGHT, LATENTS_WIDTH)

        if input_image:
            input_image = Image.open(input_image)
            encoder = models["encoder"]
            encoder.to(device)

            input_image_tensor = input_image.resize((WIDTH, HEIGHT))
            # (Height, Width, Channel)
            input_image_tensor = np.array(input_image_tensor)
            # Convert to a float tensor on the target device
            input_image_tensor = torch.tensor(input_image_tensor, dtype=torch.float32, device=device)
            # Rescale pixel values from [0, 255] to [-1, 1]
            input_image_tensor = rescale(input_image_tensor, (0, 255), (-1, 1))
            # (Height, Width, Channel) -> (Batch_Size, Height, Width, Channel)
            input_image_tensor = input_image_tensor.unsqueeze(0)
            # (Batch_Size, Height, Width, Channel) -> (Batch_Size, Channel, Height, Width)
            input_image_tensor = input_image_tensor.permute(0, 3, 1, 2)

            # (Batch_Size, 4, Latents_Height, Latents_Width)
            encoder_noise = torch.randn(latents_shape, generator=generator, device=device)
            # (Batch_Size, 4, Latents_Height, Latents_Width)
            latents = encoder(input_image_tensor, encoder_noise)

            # Add noise to the latents (the encoded input image). A higher
            # strength starts from a noisier latent, so the output strays
            # further from the input image.
            sampler.set_strength(strength=strength)
            latents = sampler.add_noise(latents, sampler.timesteps[0])

            to_idle(encoder)
        else:
            # Text-to-image: start from pure Gaussian noise
            # (Batch_Size, 4, Latents_Height, Latents_Width)
            latents = torch.randn(latents_shape, generator=generator, device=device)
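        # set_strength (used in the image-to-image branch above) is assumed to
        # trim the timestep schedule so that sampling starts partway through;
        # a minimal sketch of what the DDPMSampler method likely does (the
        # real implementation lives in sampler.py):
        #
        #     def set_strength(self, strength=1.0):
        #         # strength=1.0 keeps every step (start from ~pure noise);
        #         # smaller values skip the earliest, noisiest steps.
        #         start_step = self.num_inference_steps - int(self.num_inference_steps * strength)
        #         self.timesteps = self.timesteps[start_step:]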
        diffusion = models["diffusion"]
        diffusion.to(device)

        timesteps = tqdm(sampler.timesteps)
        for timestep in timesteps:
            # (1, 320)
            time_embedding = get_time_embedding(timestep).to(device)

            # (Batch_Size, 4, Latents_Height, Latents_Width)
            model_input = latents

            if do_cfg:
                # (Batch_Size, 4, Latents_Height, Latents_Width) -> (2 * Batch_Size, 4, Latents_Height, Latents_Width)
                model_input = model_input.repeat(2, 1, 1, 1)

            # model_output is the predicted noise
            # (Batch_Size, 4, Latents_Height, Latents_Width) -> (Batch_Size, 4, Latents_Height, Latents_Width)
            model_output = diffusion(model_input, context, time_embedding)

            if do_cfg:
                output_cond, output_uncond = model_output.chunk(2)
                # Classifier-free guidance: move the prediction away from the
                # unconditional output and towards the conditional one
                model_output = cfg_scale * (output_cond - output_uncond) + output_uncond

            # Remove the noise predicted by the model for this timestep
            # (Batch_Size, 4, Latents_Height, Latents_Width) -> (Batch_Size, 4, Latents_Height, Latents_Width)
            latents = sampler.step(timestep, latents, model_output)
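            # The step above is a single DDPM reverse update (Ho et al. 2020),
            # roughly:
            #   x_{t-1} = 1/sqrt(alpha_t) * (x_t - (1 - alpha_t)/sqrt(1 - alpha_bar_t) * eps) + sigma_t * z
            # where eps is model_output and z is fresh Gaussian noise
            # (omitted at the final step).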
        to_idle(diffusion)

        decoder = models["decoder"]
        decoder.to(device)
        # (Batch_Size, 4, Latents_Height, Latents_Width) -> (Batch_Size, 3, Height, Width)
        images = decoder(latents)
        to_idle(decoder)

        # Rescale pixel values from [-1, 1] back to [0, 255]
        images = rescale(images, (-1, 1), (0, 255), clamp=True)
        # (Batch_Size, Channel, Height, Width) -> (Batch_Size, Height, Width, Channel)
        images = images.permute(0, 2, 3, 1)
        images = images.to("cpu", torch.uint8).numpy()
        return images[0]


def rescale(x, old_range, new_range, clamp=False):
    # Linearly map x from old_range to new_range, in place
    old_min, old_max = old_range
    new_min, new_max = new_range
    x -= old_min
    x *= (new_max - new_min) / (old_max - old_min)
    x += new_min
    if clamp:
        x = x.clamp(new_min, new_max)
    return x
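# For example, rescale(x, (0, 255), (-1, 1)) subtracts 0, multiplies by
# 2 / 255 and adds -1, mapping 0 -> -1, 127.5 -> 0 and 255 -> 1. Note that
# the in-place ops mutate the input tensor as a side effect.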


def get_time_embedding(timestep):
    # Sinusoidal (transformer-style) timestep embedding with 160 frequencies
    # Shape: (160,)
    freqs = torch.pow(10000, -torch.arange(start=0, end=160, dtype=torch.float32) / 160)
    # Shape: (1, 160)
    x = torch.tensor([timestep], dtype=torch.float32)[:, None] * freqs[None]
    # Shape: (1, 160 * 2) = (1, 320)
    return torch.cat([torch.cos(x), torch.sin(x)], dim=-1)
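

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original file. The model-loading
    # helper, checkpoint path and tokenizer weights below are assumptions for
    # illustration; this file only requires that `models` contains loaded
    # "clip", "encoder", "diffusion" and "decoder" modules.
    from transformers import CLIPTokenizer  # assumes transformers is installed
    import model_loader  # hypothetical companion module that builds `models`

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    models = model_loader.preload_models_from_standard_weights("v1-5.ckpt", device)  # hypothetical helper

    output = generate(
        prompt="A photograph of a cat wearing a tiny wizard hat",
        uncond_prompt="",
        input_image=None,
        strength=0.8,
        do_cfg=True,
        cfg_scale=7.5,
        sampler_name="ddpm",
        n_inference_steps=50,
        models=models,
        seed=42,
        device=device,
        idle_device="cpu",
        tokenizer=tokenizer,
    )
    # generate returns a (Height, Width, Channel) uint8 numpy array
    Image.fromarray(output).save("output.png")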